In [1]:
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback,IntervalStrategy
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import nltk
import torch
nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
train_df = pd.read_csv('dataset/train.csv')
val_df = pd.read_csv('dataset/val.csv')
test_df = pd.read_csv('dataset/test.csv')
peer_df = pd.read_csv('dataset/human_peer_evaluation.csv')

In [3]:
max_input = 150
max_target = 50
batch_size = 3
model_checkpoints = "facebook/bart-base"

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoints)

In [5]:
def process_input(r):
    # print( f"{r['sentence1']} {r['sentence2']} {r['sentence3']} {r['sentence4']}")
    text = f"{r['sentence1']} {r['sentence2']} {r['sentence3']} {r['sentence4']}"
    return text

In [6]:
train_x = [process_input(r[1]) for r in train_df.iterrows()]
train_y = [r[1]['sentence5'] for r in train_df.iterrows()]

val_x = [process_input(r[1]) for r in val_df.iterrows()]
val_y = [r[1]['sentence5'] for r in val_df.iterrows()]

test_x = [process_input(r[1]) for r in test_df.iterrows()]
test_y = [r[1]['sentence5'] for r in test_df.iterrows()]

peer_x = [process_input(r[1]) for r in peer_df.iterrows()]

In [7]:
# def prep_df(df):
#     input_x = []
#     input_y = []

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoints)

In [9]:
class RocDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.texts = encodings
        self.targets = labels

    def __getitem__(self, idx):
        item = dict()
        item['input_ids'] = self.texts[idx]['input_ids'] 
        item['attention_mask'] = self.texts[idx]['attention_mask'] 
        item['labels'] = self.targets[idx]
        return item

    def __len__(self):
        return len(self.targets)

In [10]:
def preprocess_data(text):
    model_input = tokenizer(text,  max_length=max_input, padding='max_length', truncation=True)
    return model_input

In [11]:
train_tokenized_x = [preprocess_data(t) for t in train_x]
train_tokenized_y = [preprocess_data(t)['input_ids'] for t in train_y]

val_tokenized_x = [preprocess_data(t) for t in val_x]
val_tokenized_y = [preprocess_data(t)['input_ids'] for t in val_y]

test_tokenized_x = [preprocess_data(t) for t in test_x]
test_tokenized_y = [preprocess_data(t)['input_ids'] for t in test_y]

peer_tokenized_x = [preprocess_data(t) for t in peer_x]


In [12]:
len(train_tokenized_x[0][0])

150

In [13]:
# sampling_size = 1000

# train_tokenized_x = train_tokenized_x[:sampling_size]
# train_tokenized_y = train_tokenized_y[:sampling_size]

# val_tokenized_x = val_tokenized_x[:sampling_size]
# val_tokenized_y = val_tokenized_y [:sampling_size]

# test_tokenized_x = test_tokenized_x[:sampling_size]
# test_tokenized_y = test_tokenized_y [:sampling_size]

In [14]:
train_data= RocDataset(train_tokenized_x,train_tokenized_y)
val_data= RocDataset(val_tokenized_x,val_tokenized_y)
test_data= RocDataset(test_tokenized_x,test_tokenized_y)

In [15]:
train_dataloader = DataLoader(train_data, batch_size=10, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=10, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=10, shuffle=True)

## Training process

In [16]:
batch_size = 10

In [17]:
#load model
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_checkpoints)

metric = load_metric('rouge')

  metric = load_metric('rouge')


In [18]:
#collator to create batches. It preprocess data with the given tokenizer
collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)

In [19]:
#####################
# metrics
# compute rouge for evaluation 
#####################

def compute_rouge(pred):
    predictions, labels = pred
    #decode the predictions
    decode_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    #decode labels
    decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    #compute results
    res = metric.compute(predictions=decode_predictions, references=decode_labels, use_stemmer=True)
    #get %
    res = {key: value.mid.fmeasure * 100 for key, value in res.items()}

    pred_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    res['gen_len'] = np.mean(pred_lens)

    return {k: round(v, 4) for k, v in res.items()}

In [41]:
# training_args = TrainingArguments(
#     output_dir="./gpt2-gerchef", #The output directory
#     overwrite_output_dir=True, #overwrite the content of the output directory
#     num_train_epochs=3, # number of training epochs
#     per_device_train_batch_size=32, # batch size for training
#     per_device_eval_batch_size=64,  # batch size for evaluation
#     eval_steps = 400, # Number of update steps between two evaluations.
#     save_steps=800, # after # steps model is saved 
#     warmup_steps=500,# number of warmup steps for learning rate scheduler
#     prediction_loss_only=True,
# )


In [20]:
args = transformers.Seq2SeqTrainingArguments(
    'story_generation',
    evaluation_strategy=IntervalStrategy.STEPS,
    eval_steps = 4000,
    save_steps= 4000,
    learning_rate=2e-5,
    per_device_train_batch_size=5,
    per_device_eval_batch_size= 5,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=30,
    predict_with_generate=True,
    eval_accumulation_steps=1,
    load_best_model_at_end=True,
    fp16=True 
    )
#only CUDA available -> fp16=True

In [21]:
trainer = transformers.Seq2SeqTrainer(
    model, 
    args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=collator,
    tokenizer=tokenizer,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
    compute_metrics=compute_rouge
)

Using cuda_amp half precision backend


In [22]:
trainer.train()

***** Running training *****
  Num examples = 40000
  Num Epochs = 30
  Instantaneous batch size per device = 5
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 2
  Total optimization steps = 120000
  Number of trainable parameters = 139420416
You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
4000,0.1972,0.181272,22.1409,5.5984,20.6267,20.6293,12.756
8000,0.182,0.180543,22.4091,5.9986,20.9688,20.9643,12.3044
12000,0.1696,0.180078,23.1205,6.145,21.2748,21.2867,13.3002
16000,0.1572,0.182617,23.5629,6.4063,21.8508,21.8587,12.2988
20000,0.1459,0.184119,24.0372,6.7075,22.3179,22.3195,12.7212
24000,0.1368,0.188432,23.9717,6.6272,22.1853,22.1841,12.4568


***** Running Evaluation *****
  Num examples = 5000
  Batch size = 5
Saving model checkpoint to story_generation\checkpoint-4000
Configuration saved in story_generation\checkpoint-4000\config.json
Model weights saved in story_generation\checkpoint-4000\pytorch_model.bin
tokenizer config file saved in story_generation\checkpoint-4000\tokenizer_config.json
Special tokens file saved in story_generation\checkpoint-4000\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 5
Saving model checkpoint to story_generation\checkpoint-8000
Configuration saved in story_generation\checkpoint-8000\config.json
Model weights saved in story_generation\checkpoint-8000\pytorch_model.bin
tokenizer config file saved in story_generation\checkpoint-8000\tokenizer_config.json
Special tokens file saved in story_generation\checkpoint-8000\special_tokens_map.json
Deleting older checkpoint [story_generation\checkpoint-500] due to args.save_total_limit
***** Running Evaluatio

TrainOutput(global_step=24000, training_loss=0.19162510013580322, metrics={'train_runtime': 8545.5261, 'train_samples_per_second': 140.424, 'train_steps_per_second': 14.042, 'total_flos': 2.143604736e+16, 'train_loss': 0.19162510013580322, 'epoch': 6.0})

In [23]:
trainer.save_model('model')

Saving model checkpoint to model
Configuration saved in model\config.json
Model weights saved in model\pytorch_model.bin
tokenizer config file saved in model\tokenizer_config.json
Special tokens file saved in model\special_tokens_map.json


In [24]:
test_sample = test_x[0]

In [25]:
model_inputs = tokenizer(test_sample,  max_length=max_input, padding='max_length', truncation=True)

In [26]:
raw_pred, _, _ = trainer.predict([model_inputs])

***** Running Prediction *****
  Num examples = 1
  Batch size = 5


In [27]:
tokenizer.batch_decode(raw_pred, skip_special_tokens=True)

["The pizza was delivered to Jimmy's house the next day."]

In [28]:
test_sample

'Jimmy was too lazy to cook dinner. He decided to order a mushroom pizza to his home. When his pizza arrived, he got an anchovy pizza instead. Jimmy complained about his incorrect order through the phone.'

In [29]:
tokenizer.decode(raw_pred[0])

"</s><s>The pizza was delivered to Jimmy's house the next day.</s><pad><pad><pad><pad><pad>"

In [30]:
test_y[0]

'He was able to get a new pizza delivered to his home.'

In [31]:
# trainer.predict(train_tokenized_x)

In [32]:
train_prediction = trainer.predict(train_tokenized_x)

***** Running Prediction *****
  Num examples = 40000
  Batch size = 5


In [33]:
val_prediction = trainer.predict(val_tokenized_x)

***** Running Prediction *****
  Num examples = 5000
  Batch size = 5


In [34]:
test_prediction = trainer.predict(test_tokenized_x)

***** Running Prediction *****
  Num examples = 5000
  Batch size = 5


In [35]:
peer_prediction = trainer.predict(peer_tokenized_x)

***** Running Prediction *****
  Num examples = 50
  Batch size = 5


In [36]:
decode_train_prediction = [tokenizer.decode(p,skip_special_tokens=True) for p in train_prediction[0]]
decode_val_prediction = [tokenizer.decode(p,skip_special_tokens=True) for p in val_prediction[0]]
decode_test_prediction = [tokenizer.decode(p,skip_special_tokens=True) for p in test_prediction[0]]
decode_peer_prediction = [tokenizer.decode(p,skip_special_tokens=True) for p in peer_prediction[0]]

In [37]:
train_df['prediction'] = decode_train_prediction 
val_df['prediction'] = decode_val_prediction 
test_df['prediction'] = decode_test_prediction 
peer_df['prediction'] = decode_peer_prediction

In [38]:
train_df.to_csv('dataset/train_prediction.csv',index=False)
val_df.to_csv('dataset/val_prediction.csv',index=False)
test_df.to_csv('dataset/test_prediction.csv',index=False)
peer_df.to_csv('dataset/peer_prediction.csv',index=False)