In [1]:
import os
# Restrict this process to physical GPUs 1 and 2; set before torch is imported below.
os.environ['CUDA_VISIBLE_DEVICES']='1,2'
#import wget
import sys
# Put the parent directory on the import path so the `src` package imports below resolve.
sys.path.append('../')
import argparse
import json
import pandas as pd
import random
import numpy as np
import string
import nltk
from functools import partial
import re
from  tqdm import tqdm
import torch
# Fetch the "punkt" tokenizer data (a no-op when already present, see the cell output).
nltk.download('punkt')
from src.dataset_classes import DatasetObject,Features,SquadQuestionGenerationDataset
from src.utils import answerGeneratorDataset,questionGeneratorDataset,buildFact,setuptokenizer,pad_seq,SmartCollator,process_extra
from dataclasses import dataclass, field
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /home/nlplab/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Raw inputs: rows with any missing value are dropped right after loading.
extra_train = pd.read_csv('../datasets/processed_new_data.csv').dropna()
squad_train = pd.read_csv('../datasets/train-v2.0.csv').dropna()
squad_test = pd.read_csv('../datasets/test-v2.0.csv').dropna()

# Both SQuAD splits are narrowed to the same seven columns, in this order.
squad_columns = [
    'question', 'is_impossible', 'title', 'context',
    'answer', 'answer_start', 'answer_end',
]
train_raw_data = squad_train[squad_columns]
test_raw_data = squad_test[squad_columns]

In [4]:
# Sanity check: (rows, columns) of the column-filtered SQuAD training frame.
train_raw_data.shape

(130315, 7)

In [4]:
# Dedicated, seeded generator for ordering the example pools. A single
# shuffle already produces a uniform permutation, so the pool is shuffled
# once, and the fixed seed makes the ordering repeatable across kernel
# restarts without touching the global `random` state used elsewhere.
packet_rng = random.Random(42)

# Training pool: question-generation examples + answer-generation examples
# built from the SQuAD training frame, plus the extra processed records.
# NOTE(review): the second argument (80000) looks like a per-task record
# budget - confirm against src.utils.
train_data_packet = questionGeneratorDataset(train_raw_data,80000) + answerGeneratorDataset(train_raw_data,80000) + process_extra(extra_train)
packet_rng.shuffle(train_data_packet)

# Evaluation pool built the same way from the SQuAD test frame (no extra data).
test_data_packet = questionGeneratorDataset(test_raw_data,6000) + answerGeneratorDataset(test_raw_data,5000)
packet_rng.shuffle(test_data_packet)

80000it [00:04, 19423.01it/s]
80000it [00:08, 9790.33it/s] 
129it [00:00, 678026.59it/s]
6000it [00:00, 18494.65it/s]
5000it [00:00, 9392.38it/s]


In [7]:
# Checkpoint name shared by the tokenizer and (later) the model.
model_base = 'facebook/bart-base'

# Extra tokens registered with the tokenizer: section markers and the two task prompts.
task_special_tokens = [
    '<section>',
    '</section>',
    '<generate_questions>',
    '<generate_answers>',
]
tokenizer = setuptokenizer(model_base=model_base, special_tokens=task_special_tokens)

In [8]:
# Compose the train / evaluation datasets from the shuffled example pools.
# Both wrappers are constructed with identical arguments.
# NOTE(review): the evaluation dataset is also sized with len(train_raw_data);
# this looks like a copy-paste leftover - confirm what nb_records controls
# once set_record() has been called.
dataset_kwargs = dict(
    tokenizer=tokenizer,
    nb_records=len(train_raw_data),
    highlight_section=False,
)

train_dataset = SquadQuestionGenerationDataset(**dataset_kwargs)
train_dataset.change_data_mode(1)
train_dataset.set_record(train_data_packet)

test_dataset = SquadQuestionGenerationDataset(**dataset_kwargs)
test_dataset.change_data_mode(1)
test_dataset.set_record(test_data_packet)

In [9]:
# Number of examples the training dataset reports after set_record().
len(train_dataset)

159917

In [9]:
from transformers import BartForConditionalGeneration
# Set up the model
def model_init(device =  torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu') ):
    """Build a fresh BART generator whose embeddings match the extended tokenizer.

    Args:
        device: Where to place the model when the function is called directly.
            When used as ``Trainer(model_init=...)``, the Trainer passes its
            hyper-parameter-search trial (``None`` outside a search) as the
            single positional argument, so this parameter may receive
            something that is not a device at all.

    Returns:
        A ``BartForConditionalGeneration`` loaded from ``model_base`` with its
        token embeddings resized to ``len(tokenizer)``.
    """
    generator = BartForConditionalGeneration.from_pretrained(model_base)
    # The tokenizer was extended with special tokens, so grow the embedding matrix to match.
    generator.resize_token_embeddings(len(tokenizer))
    # Only move the model for a genuine device spec. A trial object handed in
    # by the Trainer would make `.to(...)` fail; in that case leave placement
    # to the Trainer, which puts the model on its own device.
    if isinstance(device, (torch.device, str, int)):
        return generator.to(device)
    return generator

In [11]:
from typing import Optional,Union,Callable,Dict,List,Tuple
from transformers import TrainingArguments,Trainer,PreTrainedModel,DataCollator,PreTrainedTokenizerBase,EvalPrediction,EarlyStoppingCallback,TrainerCallback,ProgressCallback
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset
class CustomTrainer(Trainer):
  """Trainer whose loss is computed from explicit encoder/decoder tensors in the batch.

  The batch is expected to be a mapping with the keys ``input_ids``,
  ``attention_mask``, ``labels`` and ``decoder_attention_mask``.

  Attributes:
      device: The device passed at construction time. It is kept for
          introspection and used only as a fallback when the model exposes
          no parameters; batches otherwise follow the model's own device.
  """
  def __init__(self,device= torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu'), model: Union[PreTrainedModel, nn.Module] = None, args: TrainingArguments = None, data_collator: Optional[DataCollator] = None, train_dataset: Optional[Dataset] = None, eval_dataset: Optional[Dataset] = None, tokenizer: Optional[PreTrainedTokenizerBase] = None, model_init: Callable[[], PreTrainedModel] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, callbacks: Optional[List[TrainerCallback]] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None,None), preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = None):
    # Forward by keyword: positional forwarding silently mis-assigns arguments
    # whenever the base class's parameter order differs from the one assumed here.
    super().__init__(
      model=model,
      args=args,
      data_collator=data_collator,
      train_dataset=train_dataset,
      eval_dataset=eval_dataset,
      tokenizer=tokenizer,
      model_init=model_init,
      compute_metrics=compute_metrics,
      callbacks=callbacks,
      optimizers=optimizers,
      preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )
    self.device = device

  def compute_loss(self, model, batch, return_outputs=False, num_items_in_batch=None):
    """Run a forward pass with labels and return the model's loss.

    Args:
        model: The (possibly wrapped) model to call.
        batch: Mapping holding ``input_ids``, ``attention_mask``, ``labels``
            and ``decoder_attention_mask`` tensors.
        return_outputs: When true, return ``(loss, outputs)`` instead of the loss alone.
        num_items_in_batch: Accepted for compatibility with Trainer versions
            that pass this keyword; it is not used here.

    Returns:
        The loss, or a ``(loss, outputs)`` tuple when ``return_outputs`` is true.
    """
    # Send the tensors to wherever the model's weights actually live rather
    # than to a separately configured device, which may not match.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else self.device

    b_input_ids = batch['input_ids'].to(target_device)
    b_input_mask = batch['attention_mask'].to(target_device)
    b_labels = batch['labels'].to(target_device)
    decoder_attention_mask = batch['decoder_attention_mask'].to(target_device)

    outputs = model(b_input_ids, attention_mask = b_input_mask, decoder_attention_mask = decoder_attention_mask,
                             labels = b_labels)
    # Model outputs may be dict-like or a plain tuple with the loss first.
    loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
    return (loss, outputs) if return_outputs else loss

In [12]:
# Fine-tuning configuration, grouped by concern.
train_args = TrainingArguments(
    # where checkpoints go
    output_dir='trained_models/setup_1/',
    overwrite_output_dir=True,
    # evaluation and checkpoint retention
    evaluation_strategy='steps',
    load_best_model_at_end=True,
    save_total_limit=1,
    # schedule and optimisation
    num_train_epochs=5,
    per_device_train_batch_size=16,
    adafactor=False,
    weight_decay=0.3,
    lr_scheduler_type='cosine',
    warmup_ratio=0.21,
)


In [None]:
# Wire the trainer together: a fresh model comes from model_init, batches are
# padded by SmartCollator using the tokenizer's pad id, and early stopping
# uses a patience of 4.
custom_trainer = CustomTrainer(
    model_init=model_init,
    args=train_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=SmartCollator(pad_token_id=train_dataset.tokenizer.pad_token_id),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
)

In [14]:
# Device recorded on the trainer by CustomTrainer.__init__ (its `device` argument).
custom_trainer.device

device(type='cuda', index=1)

In [15]:
# Run fine-tuning; the call returns a TrainOutput summary (shown below) when training stops.
custom_trainer.train()

loading configuration file config.json from cache at /home/nlplab/.cache/huggingface/hub/models--facebook--bart-base/snapshots/84358834e73de6a82c22cec1d90eb45ef4f6eba5/config.json
Model config BartConfig {
  "_name_or_path": "bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "

Step,Training Loss,Validation Loss
500,1.0121,0.699232
1000,0.7016,0.658765
1500,0.6612,0.650026
2000,0.6438,0.642064
2500,0.6385,0.642977
3000,0.5764,0.635159
3500,0.5835,0.636798
4000,0.5562,0.632436
4500,0.5754,0.627813
5000,0.558,0.62813


***** Running Evaluation *****
  Num examples = 9982
  Batch size = 16
Saving model checkpoint to trained_models/setup_1/checkpoint-500
Configuration saved in trained_models/setup_1/checkpoint-500/config.json
Model weights saved in trained_models/setup_1/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 9982
  Batch size = 16
Saving model checkpoint to trained_models/setup_1/checkpoint-1000
Configuration saved in trained_models/setup_1/checkpoint-1000/config.json
Model weights saved in trained_models/setup_1/checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [trained_models/setup_1/checkpoint-500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 9982
  Batch size = 16
Saving model checkpoint to trained_models/setup_1/checkpoint-1500
Configuration saved in trained_models/setup_1/checkpoint-1500/config.json
Model weights saved in trained_models/setup_1/checkpoint-1500/pytorch_model.bin
Deleting older checkpoint [trained_

TrainOutput(global_step=9500, training_loss=0.5488569207442434, metrics={'train_runtime': 7117.8339, 'train_samples_per_second': 56.261, 'train_steps_per_second': 1.758, 'total_flos': 5.898681838393344e+16, 'train_loss': 0.5488569207442434, 'epoch': 3.8})

In [16]:
from nltk.util import ngrams
import wikipedia
def factgenerator(document,n):
    """Return every run of ``n`` consecutive sentences found in ``document``.

    The text is stripped, split into sentences with ``sent_tokenize``, and the
    sentence sequence is windowed with ``ngrams``; each window is a tuple.
    """
    sentences = sent_tokenize(document.strip())
    sentence_windows = ngrams(sentences, n)
    return list(sentence_windows)



In [19]:
# Fetch the Wikipedia summary and cut it into overlapping n-sentence passages.
article = wikipedia.summary('pan cake')
n=3
facts = []
for sentence_window in factgenerator(article, n=n):
    # Join the window's sentences with spaces, drop embedded newlines, trim the ends.
    passage = ' '.join(sentence_window)
    facts.append(passage.replace('\n','').strip())

In [18]:
# Display the fetched summary text that `facts` is built from.
article

'A pancake (or hot-cake, griddlecake, or flapjack) is a flat cake, often thin and round, prepared from a starch-based batter that may contain eggs, milk and butter and cooked on a hot surface such as a griddle or frying pan, often frying with oil or butter. It is a type of batter bread. Archaeological evidence suggests that pancakes were probably eaten in prehistoric societies.The pancake\'s shape and structure varies worldwide. In the United Kingdom, pancakes are often unleavened and resemble a crêpe. In North America, a leavening agent is used (typically baking powder) creating a thick fluffy pancake. A crêpe is a thin Breton pancake of French origin cooked on one or both sides in a special pan or crepe maker to achieve a lacelike network of fine bubbles. A well-known variation originating from southeast Europe is a palačinke, a thin moist pancake fried on both sides and filled with jam, cream cheese, chocolate, or ground walnuts, but many other fillings—sweet or savoury—can also be 

In [20]:
# Decoding settings shared by the generation cells.
max_length = 250
length_penalty = 2.6
beam_size = 4
repetition_penalty = 1.56

# With sampling on, top-k / top-p filtering is enabled and 10 sequences are
# requested; otherwise no sampling options are passed and one sequence per
# beam is returned.
sample_too = True
if sample_too:
    sampling_helper = dict(top_k=30, top_p=0.95,)
    return_top_beams = 10
else:
    sampling_helper = {}
    return_top_beams = beam_size

In [21]:
# Separate dataset wrapper for inference; later cells use its procesTexts() and tokenizer.
dataset = SquadQuestionGenerationDataset(tokenizer,nb_records=1)
# Same data mode (1) as the train/test datasets.
# NOTE(review): the meaning of the mode value is defined in src.dataset_classes - confirm.
dataset.change_data_mode(1)

In [24]:
# Trained model held by the trainer, reused below for generation.
generator = custom_trainer.model
# Put the model in evaluation mode explicitly so dropout is disabled during
# generation, whatever mode the training loop left it in.
generator.eval()
# Device the model lives on; inference inputs are moved here.
device = custom_trainer.model.device

In [32]:
# --- Build the prompt -------------------------------------------------------
task_id = 0
# NOTE(review): index 10 is hard-coded and requires the article to yield at
# least 11 passages.
target_fact = facts[10]
data = DatasetObject(
    task='<generate_questions> ',
    question='',
    context=target_fact,
    fact=target_fact,
    answer='',
    answer_sentence='',
    task_id="",
)

batch = dataset.procesTexts(data)

# Reshape to a single-row batch and move to the model's device.
b_input_ids = batch.input_ids.view(1,-1).to(device)
b_input_mask = batch.attention_mask.view(1,-1).to(device)

# --- Decoding options -------------------------------------------------------
# These names are re-bound here and override the earlier settings cell
# (top_k is 25 in this cell, 30 there).
sample_too = True
if sample_too:
    sampling_helper = dict(top_k=25, top_p=0.95,)
    return_top_beams = 10
else:
    sampling_helper = {}
    return_top_beams = beam_size

# --- Generate ---------------------------------------------------------------
with torch.no_grad():
  sample_outputs = generator.generate(
      input_ids=b_input_ids,
      **sampling_helper,
      attention_mask=b_input_mask,
      num_beams=beam_size,
      repetition_penalty=repetition_penalty,
      length_penalty=length_penalty,
      early_stopping=True,
      use_cache=True,
      max_length=max_length,
      no_repeat_ngram_size=2,
      num_return_sequences=return_top_beams,
      do_sample=sample_too,
      eos_token_id=dataset.tokenizer.eos_token_id,
  )

# --- Decode and show --------------------------------------------------------
oop = []
for idx in range(return_top_beams):
  decoded_text = dataset.tokenizer.decode(
      sample_outputs[idx],
      skip_special_tokens=True,
      clean_up_tokenization_spaces=True,
  )
  oop.append(decoded_text)

print(f'Article Section: {data.context}')
print('Questions Generated')
for q in oop:
  print(q)

Article Section: Buckwheat flour can be used in a pancake batter, making for a type of buckwheat pancake, a category that includes Blini, Kaletez, Ploye, and Memil-buchimgae. When potato is used as a major portion of the batter, the result is a potato pancake. Pancakes may be served at any time of the day or year with a variety of toppings or fillings, but they have developed associations with particular times and toppings in different regions.
Questions Generated
What is the result of a potato pancake when potato is used as a major part of the batter?
What is the result of a potato pancake when used as a large part of the batter?
What is the result of a potato pancake when potato is used as a major part of the batter?
What is the result of a potato pancake when potato is used as a major part of the batter?
What is the result of a potato pancake when used as a large part of the batter?
What is the result of a potato pancake when potato is used as a major part of the batter?
What is the