In [1]:
import os

# Pin this process to the GPU with index 1 (set here, before transformers is imported below).
os.environ.update(CUDA_VISIBLE_DEVICES="1")

In [2]:
import json
import jsonlines
import re
import transformers
from datasets import load_dataset, load_metric

### 處理data

In [4]:
# Read the raw pattern file; downstream cells index it as
# data[headword] -> list of entries with 'pattern' and 'example' keys.
with open('prepare_data/pattern4.json', encoding='utf-8') as pattern_file:
    data = json.loads(pattern_file.read())

In [10]:
# Markup fragments removed from every example sentence before tokenising.
MARKUP_FRAGMENTS = ('<span class="x">', '<span class="cl">', '</span>')

# new_data : headword -> examples cut off right after the first exact occurrence of the headword
# verb_data: headword -> examples (kept whole) in which no space-separated token equals the headword
new_data = {}
verb_data = {}
for headword, entries in data.items():
    truncated_examples = []
    untouched_examples = []
    for entry in entries:
        sentence = entry['example']
        for fragment in MARKUP_FRAGMENTS:
            sentence = sentence.replace(fragment, '')
        tokens = sentence.strip().split(' ')
        try:
            # Keep everything up to and including the first token equal to the headword.
            cut = tokens.index(headword) + 1
        except ValueError:
            # Headword never appears verbatim (e.g. only an inflected form does).
            untouched_examples.append(
                {'pattern': entry['pattern'], 'text': ' '.join(tokens).strip()}
            )
        else:
            truncated_examples.append(
                {'pattern': entry['pattern'], 'text': ' '.join(tokens[:cut]).strip()}
            )
    if truncated_examples:
        new_data[headword] = truncated_examples
    if untouched_examples:
        verb_data[headword] = untouched_examples

In [13]:
# Spot-check one headword in both buckets (truncated examples first, then untouched ones).
for bucket in (new_data, verb_data):
    print(bucket['abandon'])

[{'pattern': 'abandon something', 'text': 'Snow forced many drivers to abandon'}, {'pattern': 'abandon something', 'text': 'Snow forced many drivers to abandon'}, {'pattern': 'abandon something to somebody/something', 'text': 'They had to abandon'}]
[{'pattern': 'abandon somebody', 'text': 'The baby had been abandoned by its mother.'}, {'pattern': 'abandon somebody', 'text': 'The baby had been abandoned by its mother.'}, {'pattern': 'abandon somebody to something', 'text': '‘We have been abandoned to our fate,’ said one resident.'}, {'pattern': 'abandon something', 'text': 'They abandoned the match because of rain.'}, {'pattern': 'abandon somebody', 'text': 'The country abandoned its political leaders after the war.'}, {'pattern': 'abandon somebody', 'text': 'The country abandoned its political leaders after the war.'}, {'pattern': 'abandon something', 'text': 'By 1930 he had abandoned his Marxist principles.'}]


In [23]:
# Number of headwords having at least one example with no token equal to the bare headword.
len(verb_data)

3663

In [14]:
# Persist only the truncated examples (new_data); verb_data is not written out.
with open('data.json', 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(new_data, indent=4))

### 改成dataset形式

In [15]:
# Reload the saved examples so this section can run from the file on disk.
with open('data.json', encoding='utf-8') as in_file:
    data = json.loads(in_file.read())

In [16]:
# Flatten headword -> [records] into one {'pattern', 'text'} record per line.
with jsonlines.open('data.jsonlines', 'w') as writer:
    for records in data.values():
        for record in records:
            writer.write(record)

In [3]:
# Remove duplicate records from data.jsonlines and write the result to data1.jsonlines.
# dict.fromkeys keeps the first occurrence of each line in file order, so the output is
# identical on every run (a set would order lines by randomised string hashes).
# Both files are opened as UTF-8 explicitly because the examples contain non-ASCII
# characters such as typographic quotes.
with open("data.jsonlines", "r", encoding="utf-8") as f:
    temp = dict.fromkeys(f.readlines())
with open("data1.jsonlines", "w", encoding="utf-8") as w:
    w.writelines(temp)

### import dataset

In [4]:
# Load the de-duplicated records; the result is a DatasetDict with a single "train" split
# (see the output below). Later cells add "validation" and "test" to this same object.
datasets = load_dataset("json", data_files="data1.jsonlines")

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
# Display the splits, columns and row counts.
datasets

DatasetDict({
    train: Dataset({
        features: ['pattern', 'text'],
        num_rows: 8062
    })
})

In [6]:
# Carve 500 test rows and 500 validation rows out of the training data.
# A fixed seed keeps the three splits identical across kernel restarts; without it a
# model saved in an earlier session could be evaluated on rows it was trained on.
# NOTE: this cell overwrites datasets["train"], so re-running it without reloading the
# dataset shrinks the training split again.
SPLIT_SEED = 42

datasets_train_test = datasets["train"].train_test_split(test_size=500, seed=SPLIT_SEED)
datasets_train_validation = datasets_train_test["train"].train_test_split(
    test_size=500, seed=SPLIT_SEED
)

datasets["train"] = datasets_train_validation["train"]
datasets["validation"] = datasets_train_validation["test"]
datasets["test"] = datasets_train_test["test"]

In [7]:
import string

import nltk
from transformers import AutoTokenizer

# Sentence-splitting data needed by nltk.sent_tokenize, which the metric and gen() cells call.
nltk.download('punkt')

# Base checkpoint used for both the tokenizer and, later, the model.
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/nlplab/maggie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Task prefix prepended to every input text, and truncation limits (in tokens)
# for the source text and the target pattern.
prefix = "summarize: "
max_input_length = 256
max_target_length = 64

def preprocess_data(examples):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        examples: batch mapping with a "text" list (model input) and a "pattern"
            list (generation target) of equal length.

    Returns:
        The tokenizer output for the prefixed texts, with an extra "labels" entry
        holding the token ids of the target patterns.
    """
    inputs = [prefix + text for text in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize the targets via `text_target`, the replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager. It is a separate call so
    # that the targets get their own (shorter) truncation limit.
    labels = tokenizer(text_target=examples["pattern"], max_length=max_target_length,
                       truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [9]:
# Apply preprocess_data to every split in batches; adds input_ids, attention_mask and labels.
tokenized_datasets = datasets.map(preprocess_data, batched=True)

Map:   0%|          | 0/7062 [00:00<?, ? examples/s]



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [10]:
# Display the tokenized splits to confirm the new columns and row counts.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['pattern', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 7062
    })
    validation: Dataset({
        features: ['pattern', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['pattern', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 500
    })
})

In [11]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

In [12]:
batch_size = 8
model_name = "t5-base-medium-title-generation"
model_dir = "t5_small"

# Training configuration, grouped by concern and expanded into the arguments object below.
training_kwargs = {
    # where checkpoints and logs are written
    "output_dir": model_dir,
    # evaluate and log every 100 steps
    "evaluation_strategy": "steps",
    "eval_steps": 100,
    "logging_strategy": "steps",
    "logging_steps": 100,
    # checkpoint every 200 steps, keeping at most 3 on disk
    "save_strategy": "steps",
    "save_steps": 200,
    "save_total_limit": 3,
    # optimisation
    "learning_rate": 4e-5,
    "per_device_train_batch_size": batch_size,
    "per_device_eval_batch_size": batch_size,
    "weight_decay": 0.01,
    "num_train_epochs": 1,
    "fp16": True,
    # evaluation generates text so ROUGE can be computed; best checkpoint chosen by rouge1
    "predict_with_generate": True,
    "load_best_model_at_end": True,
    "metric_for_best_model": "rouge1",
    "report_to": "tensorboard",
}

args = Seq2SeqTrainingArguments(**training_kwargs)

In [13]:
# Batch collator handed to the trainer below; built from the tokenizer only
# (presumably pads inputs and labels per batch — see the DataCollatorForSeq2Seq docs).
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [14]:
# ROUGE metric used by compute_metrics. `trust_remote_code=True` is passed explicitly:
# the library warns that it will be mandatory for loading this metric in the next
# major release of `datasets`.
metric = load_metric("rouge", trust_remote_code=True)

  metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [15]:
import numpy as np


def _clean_token_ids(sequence, pad_token_id, ignore_index=-100):
    """Return one token-id sequence as a 1-D array with `ignore_index` swapped for the pad id."""
    if hasattr(sequence, "detach"):
        # torch tensors (e.g. rows of raw `model.generate` output) -> numpy on CPU
        sequence = sequence.detach().cpu().numpy()
    ids = np.asarray(sequence)
    return np.where(ids != ignore_index, ids, pad_token_id)


def compute_metrics(eval_pred):
    """Compute ROUGE F1 scores (x100) and the mean generated length.

    Args:
        eval_pred: pair `(predictions, labels)`. Each element is a batch of token-id
            sequences: a 2-D array, or a list of lists / arrays / tensors whose rows
            may have different lengths.

    Returns:
        dict mapping each ROUGE key to its mid F-measure times 100, plus "gen_len"
        (mean number of non-pad tokens per prediction), all rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    # -100 cannot be decoded. Clean every sequence on its own so that plain Python
    # lists and ragged batches are handled, not just rectangular numpy arrays.
    pred_ids = [_clean_token_ids(seq, pad_id) for seq in predictions]
    label_ids = [_clean_token_ids(seq, pad_id) for seq in labels]

    decoded_preds = tokenizer.batch_decode([ids.tolist() for ids in pred_ids],
                                           skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode([ids.tolist() for ids in label_ids],
                                            skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                     for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip()))
                      for label in decoded_labels]

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length to metrics (pad tokens are not counted)
    prediction_lens = [np.count_nonzero(ids != pad_id) for ids in pred_ids]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [16]:
# Factory passed to the trainer as `model_init`: returns a fresh copy of the pretrained
# `model_checkpoint` weights, i.e. a model that has not been fine-tuned yet.
def model_init():
    """Load the seq2seq model from `model_checkpoint` via `from_pretrained`."""
    return AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Wire together the model factory, training arguments, tokenized train/validation splits,
# batch collator, tokenizer and the ROUGE metric function.
trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [17]:
# Start TensorBoard before training to monitor it in progress.
# The log directory is the `runs` folder under the training output dir ('t5_small').
# NOTE(review): the captured output below shows TensorBoard failing to launch in this
# environment; training itself does not depend on this cell.
%load_ext tensorboard
%tensorboard --logdir 't5_small'/runs

ERROR: Failed to launch TensorBoard (exited with 1).
Contents of stderr:
Traceback (most recent call last):
  File "/home/nlplab/maggie/.local/bin/tensorboard", line 5, in <module>
    from tensorboard.main import run_main
  File "/usr/local/lib/python3.8/dist-packages/tensorboard/main.py", line 27, in <module>
    from tensorboard import default
  File "/usr/local/lib/python3.8/dist-packages/tensorboard/default.py", line 32, in <module>
    from tensorboard.plugins.audio import audio_plugin
  File "/usr/local/lib/python3.8/dist-packages/tensorboard/plugins/audio/audio_plugin.py", line 25, in <module>
    from tensorboard.data import provider
  File "/usr/local/lib/python3.8/dist-packages/tensorboard/data/__init__.py", line 17, in <module>
    from tensorboard.data import experimental  # noqa: F401
  File "/usr/local/lib/python3.8/dist-packages/tensorboard/data/experimental/__init__.py", line 17, in <module>
    from tensorboard.data.experimental.experiment_from_dev import (  # noqa: F

In [18]:
# Fine-tune for the configured single epoch; the table below reports loss and ROUGE
# on the validation split every 100 steps.
trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
100,4.4104,2.491439,40.0633,10.6648,40.0503,39.9647,6.702
200,2.4686,1.736236,63.1805,30.6295,62.9654,62.9343,4.402
300,1.9406,1.485804,66.4584,34.6634,66.2945,66.2641,4.25
400,1.729,1.340472,68.0602,36.0646,67.8335,67.7615,4.274
500,1.5418,1.254006,69.97,37.8517,69.9063,69.8201,4.222
600,1.4764,1.211502,70.9713,39.6281,70.928,70.9322,4.274
700,1.4883,1.17386,71.4638,40.4429,71.3894,71.3704,4.322
800,1.4132,1.16399,71.3406,40.2589,71.3017,71.3054,4.326


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=883, training_loss=2.0008316904070154, metrics={'train_runtime': 108.8386, 'train_samples_per_second': 64.885, 'train_steps_per_second': 8.113, 'total_flos': 24558721499136.0, 'train_loss': 2.0008316904070154, 'epoch': 1.0})

In [19]:
# Save the fine-tuned model to ./gen3; the "Try the model" section reloads the model
# and tokenizer from this directory.
trainer.save_model('gen3')

### Try the model

In [20]:
#model_name = "t5-base-medium-title-generation/checkpoint-2000"
# Directory written by trainer.save_model, and the input truncation limit used by gen().
model_dir = "gen3"
max_input_length = 256

# Reload the fine-tuned weights and the matching tokenizer from disk.
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

In [23]:
def gen(text):
    """Generate a pattern for one input text using the fine-tuned model.

    Args:
        text: the input string; the "summarize: " task prefix is added here.

    Returns:
        The first sentence of the decoded generation, or "" when the model
        produces no decodable text.

    Note:
        `do_sample=True` is combined with 8 beams, so repeated calls on the same
        text may return different results unless the RNG is seeded.
    """
    inputs = ["summarize: " + text]
    inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
    output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=2, max_length=64)
    decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    # sent_tokenize returns an empty list for blank text; avoid indexing into it.
    sentences = nltk.sent_tokenize(decoded_output.strip())
    return sentences[0] if sentences else ""

In [24]:
# Try the model on a single hand-written sentence prefix.
text = "Not all the facts are made available"

gen(text)

'available to somebody/something'

### Evaluate the model on the test set

In [27]:
import torch

# get test split
test_tokenized_dataset = tokenized_datasets["test"]


def preprocess_test(examples):
    """Tokenize the prefixed input texts, padding every row to `max_input_length`."""
    inputs = [prefix + text for text in examples["text"]]
    return tokenizer(inputs, max_length=max_input_length, truncation=True,
                     padding="max_length")


# Overwrites input_ids / attention_mask with the fixed-length versions.
test_tokenized_dataset = test_tokenized_dataset.map(preprocess_test, batched=True)

# prepare dataloader (only the model inputs are converted to torch tensors)
test_tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
dataloader = torch.utils.data.DataLoader(test_tokenized_dataset, batch_size=32)

# generate text for each batch
all_predictions = []
for batch in dataloader:
    # Send inputs to wherever the model lives, so the loop also works with a GPU model;
    # results are brought back to CPU for compute_metrics.
    batch = {name: tensor.to(model.device) for name, tensor in batch.items()}
    predictions = model.generate(**batch)
    all_predictions.append(predictions.cpu())

# flatten predictions: one 1-D tensor per test example
all_predictions_flattened = [pred for preds in all_predictions for pred in preds]

# tokenize and pad the reference patterns
all_titles = tokenizer(test_tokenized_dataset["pattern"], max_length=max_target_length,
                       truncation=True, padding="max_length")["input_ids"]

# compute metrics
predictions_labels = [all_predictions_flattened, all_titles]
compute_metrics(predictions_labels)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

{'rouge1': 48.4733,
 'rouge2': 4.4467,
 'rougeL': 48.4381,
 'rougeLsum': 48.3791,
 'gen_len': 2.392}

In [20]:
import torch

# Same test-set evaluation as the cell before this one, with a generation batch size of 64.

# get test split
test_tokenized_dataset = tokenized_datasets["test"]


def preprocess_test(examples):
    """Tokenize the prefixed input texts, padding every row to `max_input_length`."""
    inputs = [prefix + text for text in examples["text"]]
    return tokenizer(inputs, max_length=max_input_length, truncation=True,
                     padding="max_length")


# Overwrites input_ids / attention_mask with the fixed-length versions.
test_tokenized_dataset = test_tokenized_dataset.map(preprocess_test, batched=True)

# prepare dataloader (only the model inputs are converted to torch tensors)
test_tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
dataloader = torch.utils.data.DataLoader(test_tokenized_dataset, batch_size=64)

# generate text for each batch
all_predictions = []
for batch in dataloader:
    # Send inputs to wherever the model lives, so the loop also works with a GPU model;
    # results are brought back to CPU for compute_metrics.
    batch = {name: tensor.to(model.device) for name, tensor in batch.items()}
    predictions = model.generate(**batch)
    all_predictions.append(predictions.cpu())

# flatten predictions: one 1-D tensor per test example
all_predictions_flattened = [pred for preds in all_predictions for pred in preds]

# tokenize and pad the reference patterns
all_titles = tokenizer(test_tokenized_dataset["pattern"], max_length=max_target_length,
                       truncation=True, padding="max_length")["input_ids"]

# compute metrics
predictions_labels = [all_predictions_flattened, all_titles]
compute_metrics(predictions_labels)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]



{'rouge1': 64.9055,
 'rouge2': 47.6276,
 'rougeL': 64.6296,
 'rougeLsum': 64.5236,
 'gen_len': 4.62}