In [1]:
import os
# Restrict this process to GPU index 1. This is set in the first cell, ahead of
# the transformers/datasets imports in the next cell.
os.environ['CUDA_VISIBLE_DEVICES']='1'

In [2]:
import json
import jsonlines
import re
import transformers
from datasets import load_dataset, load_metric

### Process the data

In [4]:
# Read the raw mapping (headword -> list of {'pattern', 'example'} entries).
with open('prepare_data/pattern4.json', encoding='utf-8') as pattern_file:
    data = json.load(pattern_file)

In [10]:
# Sort every example sentence into one of two buckets, keyed by headword:
#   new_data  - the headword occurs verbatim as a space-separated token; the
#               text is cut off right after its first occurrence.
#   verb_data - the headword never occurs verbatim (e.g. only an inflected form
#               appears); the whole sentence is kept.
new_data = {}
verb_data = {}
for headword, entries in data.items():
    truncated_examples = []
    full_examples = []
    for entry in entries:
        # Strip the HTML highlighting that wraps parts of the example.
        sentence = entry['example']
        for markup in ('<span class="x">', '<span class="cl">', '</span>'):
            sentence = sentence.replace(markup, '')
        tokens = sentence.strip().split(' ')

        if headword in tokens:
            # Keep everything up to and including the first exact match.
            cut = tokens.index(headword) + 1
            truncated_examples.append(
                {'pattern': entry['pattern'], 'text': ' '.join(tokens[:cut]).strip()}
            )
        else:
            full_examples.append(
                {'pattern': entry['pattern'], 'text': ' '.join(tokens).strip()}
            )

    # Headwords with no examples in a bucket are left out of that bucket.
    if truncated_examples:
        new_data[headword] = truncated_examples
    if full_examples:
        verb_data[headword] = full_examples

In [13]:
# Spot-check both buckets for a single headword.
for bucket in (new_data, verb_data):
    print(bucket['abandon'])

[{'pattern': 'abandon something', 'text': 'Snow forced many drivers to abandon'}, {'pattern': 'abandon something', 'text': 'Snow forced many drivers to abandon'}, {'pattern': 'abandon something to somebody/something', 'text': 'They had to abandon'}]
[{'pattern': 'abandon somebody', 'text': 'The baby had been abandoned by its mother.'}, {'pattern': 'abandon somebody', 'text': 'The baby had been abandoned by its mother.'}, {'pattern': 'abandon somebody to something', 'text': '‘We have been abandoned to our fate,’ said one resident.'}, {'pattern': 'abandon something', 'text': 'They abandoned the match because of rain.'}, {'pattern': 'abandon somebody', 'text': 'The country abandoned its political leaders after the war.'}, {'pattern': 'abandon somebody', 'text': 'The country abandoned its political leaders after the war.'}, {'pattern': 'abandon something', 'text': 'By 1930 he had abandoned his Marxist principles.'}]


In [23]:
# Number of headwords that have at least one example in the verb_data bucket.
len(verb_data)

3663

In [14]:
# Persist the truncated-example bucket (new_data) as pretty-printed JSON.
with open('data.json', 'w', encoding='utf-8') as out_file:
    json.dump(new_data, out_file, indent=4)

### Convert to dataset format

In [15]:
# Reload the saved bucket from disk. Note that this rebinds `data`, which until
# now held the raw pattern4.json content.
with open('data.json', encoding='utf-8') as in_file:
    data = json.load(in_file)

In [16]:
# Flatten the headword -> examples mapping into one JSON record per line.
with jsonlines.open('data.jsonlines','w') as writer:
    for examples in data.values():
        for record in examples:
            writer.write(record)

In [3]:
# Drop duplicate records from data.jsonlines and write the result to
# data1.jsonlines. dict.fromkeys keeps the first occurrence of each line in its
# original position, so the output is identical from run to run (iterating a
# set of strings is not). The encoding is pinned so the result does not depend
# on the platform's locale default.
with open("data.jsonlines", "r", encoding="utf-8") as f:
    unique_lines = list(dict.fromkeys(f))
with open("data1.jsonlines", "w", encoding="utf-8") as w:
    w.writelines(unique_lines)

### import dataset

In [9]:
# Load the JSON-lines records as a Hugging Face dataset.
# NOTE(review): no cell visible in this notebook writes "verb1.jsonlines" (the
# de-duplication cell writes "data1.jsonlines") - confirm where this file comes
# from and that it is the intended training set.
datasets = load_dataset("json", data_files="verb1.jsonlines")

In [4]:
# Inspect the loaded splits, columns and row count.
datasets

DatasetDict({
    train: Dataset({
        features: ['pattern', 'text'],
        num_rows: 17352
    })
})

In [10]:
# Hold out 1500 examples for validation. The seed fixes the shuffle so the same
# examples land in the validation set on every run, which keeps the evaluation
# numbers comparable between runs.
# NOTE: this cell rebinds datasets["train"] in place; re-running it without
# reloading `datasets` would split the already-reduced training set again.
datasets_train_test = datasets["train"].train_test_split(test_size=1500, seed=42)

datasets["train"] = datasets_train_test["train"]
datasets["validation"] = datasets_train_test["test"]

In [12]:
# Inspect the split result; its "test" part is what was stored as
# datasets["validation"] in the previous cell.
datasets_train_test

DatasetDict({
    train: Dataset({
        features: ['pattern', 'text'],
        num_rows: 15852
    })
    test: Dataset({
        features: ['pattern', 'text'],
        num_rows: 1500
    })
})

In [13]:
import nltk
# Fetch the Punkt data; nltk.sent_tokenize is called in later cells.
nltk.download('punkt')
import string
from transformers import AutoTokenizer

# Base checkpoint: used for this tokenizer and again by model_init() further down.
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/nlplab/maggie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
# Task prefix prepended to every input sentence, and truncation limits (in
# tokens) for the encoder input and the target pattern.
prefix = "summarize: "
max_input_length = 256
max_target_length = 64

def preprocess_data(examples):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        examples: batch mapping with a "text" list (input sentences) and a
            "pattern" list (target patterns), as passed by a batched
            `Dataset.map` call.

    Returns:
        The tokenizer output for the prefixed inputs, with the token ids of
        the patterns added under the "labels" key.
    """
    inputs = [prefix + text for text in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize the targets through `text_target`; this replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["pattern"],
                       max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [15]:
# Apply preprocess_data to every split; batched=True hands it lists of examples.
tokenized_datasets = datasets.map(preprocess_data, batched=True)

Map:   0%|          | 0/15852 [00:00<?, ? examples/s]



Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [16]:
# Check which columns the tokenization step added to each split.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['pattern', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 15852
    })
    validation: Dataset({
        features: ['pattern', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1500
    })
})

In [17]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

In [18]:
# Training configuration for the fine-tuning run.
batch_size = 8
model_name = "t5-base-medium-title-generation"
model_dir = "t5_small"  # output directory for checkpoints and logs

args = Seq2SeqTrainingArguments(
    output_dir=model_dir,
    # Optimisation
    num_train_epochs=1,
    learning_rate=4e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    # Evaluate and log every 100 steps
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    predict_with_generate=True,
    # Checkpoint every 200 steps, keep at most 3, reload the best one at the end
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard",
)



In [19]:
# Collator handed to the Seq2SeqTrainer below as `data_collator`.
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [20]:
# ROUGE metric used by compute_metrics. `trust_remote_code=True` is passed
# explicitly because the installed `datasets` release warns that it will become
# mandatory for loading this metric.
metric = load_metric("rouge", trust_remote_code=True)

  metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [21]:
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE F1 scores and the mean generated length for an evaluation.

    Args:
        eval_pred: pair `(predictions, labels)` of token-id arrays, as passed
            by the trainer.

    Returns:
        Dict mapping each ROUGE key to its mid F-measure as a percentage, plus
        "gen_len" (mean number of non-pad tokens per prediction); all values
        are rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some models return a tuple whose first element holds the generated ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore marker that cannot be decoded; map it to the pad
    # token in predictions as well as labels so batch_decode never sees it.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                     for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip()))
                      for label in decoded_labels]

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                       for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [22]:
def model_init():
    """Return a fresh copy of the pretrained checkpoint for the trainer to fine-tune."""
    return AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Wire together model factory, hyper-parameters, data and metric.
trainer = Seq2SeqTrainer(
    args=args,
    model_init=model_init,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [24]:
# Run the fine-tuning; evaluation, logging and checkpointing follow `args`.
trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
100,4.846,3.034602,31.8629,8.5963,31.6104,31.591,8.198
200,2.8758,2.093965,50.0196,23.7218,49.7901,49.7634,5.0907
300,2.2126,1.745479,54.6981,28.3629,54.4789,54.452,4.5207
400,1.8895,1.542307,58.33,32.1448,57.9479,57.97,4.4993
500,1.6735,1.438499,60.3186,34.3282,59.9974,59.9949,4.3447
600,1.6281,1.35246,61.2631,34.9976,60.9631,60.992,4.35
700,1.5037,1.288601,62.4904,36.817,62.1825,62.1993,4.4187
800,1.5134,1.238064,63.6614,38.3491,63.31,63.361,4.4493
900,1.454,1.217188,63.5281,38.4137,63.2363,63.28,4.3213
1000,1.4319,1.181065,63.9717,38.5783,63.6324,63.6736,4.374


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1982, training_loss=1.7015805364737717, metrics={'train_runtime': 552.8646, 'train_samples_per_second': 28.672, 'train_steps_per_second': 3.585, 'total_flos': 74163077775360.0, 'train_loss': 1.7015805364737717, 'epoch': 1.0})

In [25]:
# Save to the 'gen4' directory; the test section reloads model and tokenizer from it.
trainer.save_model('gen4')

### Test the model

In [20]:
# Reload the fine-tuned model and its tokenizer from the 'gen4' directory.
#model_name = "t5-base-medium-title-generation/checkpoint-2000"
model_dir = "gen4"
max_input_length = 256  # truncation limit (in tokens) for inputs passed to gen()

model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

In [23]:
def gen(text):
    """Generate the predicted pattern for an input sentence.

    Args:
        text: input sentence; the "summarize: " task prefix is added here.

    Returns:
        The first sentence of the decoded model output, or "" when the decoded
        output contains no text.
    """
    inputs = ["summarize: " + text]
    inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
    # NOTE(review): do_sample=True makes the output vary between calls - confirm
    # that sampling (rather than plain beam search) is intended here.
    output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=2, max_length=128)
    decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    sentences = nltk.sent_tokenize(decoded_output.strip())
    # sent_tokenize returns an empty list for empty text; indexing [0] directly
    # would raise IndexError in that case.
    return sentences[0] if sentences else ""

In [24]:
# Smoke test: run the fine-tuned model on a single sentence.
text = "Not all the facts are made available"

gen(text)

'available to somebody/something'