In [None]:
import json
import numpy as np
import nltk 
import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import torch
nltk.download('punkt')


# ClaimDecomp split files, listed in the order they are unpacked below
# (train, dev, test).
FILES = [
    "/kaggle/input/claimdecomp/train.jsonl",
    "/kaggle/input/claimdecomp/dev.jsonl",
    "/kaggle/input/claimdecomp/test.jsonl",
]


def _load_split(path):
    """Read one JSON-lines file into a list of {"claim", "questions"} dicts.

    Every non-blank line is parsed exactly once. "questions" is the
    space-joined question list of the record's first annotation.

    Args:
        path: Location of the .jsonl file.

    Returns:
        list[dict]: One dict per record, in file order.
    """
    examples = []
    # Binary mode is kept on purpose: json.loads accepts bytes directly.
    with open(path, 'rb') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            record = json.loads(line)
            examples.append({
                "claim": record['claim'],
                "questions": " ".join(record['annotations'][0]['questions']),
            })
    return examples


data = [_load_split(file) for file in FILES]
train_data, val_data, test_data = data

In [None]:
!pip install rouge_score

In [None]:
# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
if not torch.cuda.is_available():
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Route PyTorch work to the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    # Report the name of the device at index 0.
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
# Sanity check: show the first training example (a dict with "claim" and "questions").
print(train_data[0])

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Load the tokenizer for the facebook/bart-large checkpoint.
# The commented-out lines around it are a disabled smoke test that ran the
# base model on one example sentence; the model actually used for training is
# loaded in a later cell through AutoModelForSeq2SeqLM.
# model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
# example_english_phrase = "Unemployment is low because everyone has two jobs. Unemployment is low because people are working 60, 70, 80 hours a week and can barely feed their family."
# batch = tokenizer(example_english_phrase, return_tensors="pt")
# generated_ids = model.generate(batch["input_ids"])
# print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
# model.to(device)

In [None]:
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

# Token budgets applied when truncating the tokenized sources and targets.
max_input_length = 64
max_target_length = 128


def preprocess_function(examples, prefix="decompose the claim into subquestions:"):
    """Tokenize claim/question pairs into seq2seq model inputs.

    Args:
        examples: List of dicts, each with a 'claim' string and a 'questions' string.
        prefix: Text prepended verbatim to every claim (no separator is inserted).

    Returns:
        The tokenizer output for the prefixed claims, extended with a "labels"
        entry that holds the input ids of the tokenized questions.
    """
    sources = [prefix + example['claim'] for example in examples]
    targets = [example['questions'] for example in examples]

    encoded = tokenizer(sources, truncation=True, max_length=max_input_length)

    # Targets are tokenized through the text_target path; only their ids are kept.
    encoded_targets = tokenizer(text_target=targets, truncation=True, max_length=max_target_length)
    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

In [None]:
# create DataLoaders
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Dataset view over a mapping of per-example feature sequences."""

    def __init__(self, dic):
        """
        Args:
            dic: Mapping with 'input_ids', 'labels' and 'attention_mask'
                entries, each indexable by example position.
        """
        self.dic = dic

    def __len__(self):
        """Returns the number of entries stored under 'input_ids'."""
        input_ids = self.dic['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        """
        Looks up the features of one example.

        Args:
            idx (int): Position of the example to fetch.

        Returns:
            dict: The 'input_ids', 'labels' and 'attention_mask' values stored at ``idx``.
        """
        fields = ('input_ids', 'labels', 'attention_mask')
        return {name: self.dic[name][idx] for name in fields}
    

# Tokenize each split exactly once; the validation features are reused for
# the key display at the end of the cell instead of being recomputed.
train_features = preprocess_function(train_data)
val_features = preprocess_function(val_data)

train_dataset = CustomDataset(train_features)
val_dataset = CustomDataset(val_features)

# Last expression of the cell: shows which fields preprocessing produced.
val_features.keys()

In [None]:
import datetime

def format_time(elapsed):
    '''
    Renders a duration given in seconds as an h:mm:ss string.

    The value is rounded to a whole number of seconds and then formatted by
    str() of a datetime.timedelta.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)

# Load the pretrained facebook/bart-large checkpoint as a sequence-to-sequence LM.
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large")
# Bare expression: the notebook shows the model's repr as the cell output.
model

In [None]:
# --- Hyperparameters ---
epochs = 6
batch_size = 8
lr = 2e-5

# --- Output locations (relative paths) ---
SAVE_PATH = "bart/"
LOGGING_PATH = "bart-logs/"

args = Seq2SeqTrainingArguments(
    # Where checkpoints and logs are written; no external reporting.
    output_dir=SAVE_PATH,
    logging_dir=LOGGING_PATH,
    report_to="none",
    # Run modes.
    do_train=True,
    do_eval=True,
    # Optimisation settings.
    num_train_epochs=epochs,
    learning_rate=lr,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Step-based evaluation, logging and checkpointing.
    evaluation_strategy="steps",
    logging_steps=300,
    save_steps=300,
    save_total_limit=3,
    load_best_model_at_end=True,
    # Evaluate on generated sequences.
    predict_with_generate=True,
    generation_max_length=512,
)

In [None]:
# Batch collator built from the tokenizer and model created in earlier cells; it is handed to the trainer below.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import datasets
# ROUGE scorer shared by every compute_metrics call.
# NOTE(review): `datasets.load_metric` is reported as deprecated in newer
# `datasets` releases — confirm the installed version still provides it.
metric = datasets.load_metric("rouge", trust_remote_code=True)

def compute_metrics(eval_pred):
    """Score generated sequences against the reference questions with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. Entries
            equal to -100 are replaced by the pad token id before decoding.
            ``predictions`` may also be a tuple whose first element is the
            token-id array.

    Returns:
        dict: Each ROUGE mid F-measure scaled to 0-100, plus "gen_len" (mean
        count of non-pad tokens per prediction), all rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs next to the generated ids; keep only
    # the ids so the elementwise comparisons below operate on an array.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    print(predictions[0])
    # -100 cannot be decoded, so map it to the pad token first.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Show one decoded sample only; printing every generation floods the
    # notebook output on each evaluation pass.
    print("decoded gen", decoded_preds[:1])
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of each ROUGE variant, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (non-pad tokens per prediction).
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Assemble the trainer from the model, arguments, collator, datasets and
# metric function defined in the cells above.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Bare expression: displays the trainer object's repr as the cell output.
trainer

In [None]:
# Baseline: evaluate on the eval dataset before trainer.train() has been called.
trainer.evaluate()

In [None]:
# Run training with the arguments and datasets held by `trainer`.
trainer.train()

In [None]:
# Evaluate again after training, for comparison with the pre-training evaluation above.
trainer.evaluate()

In [None]:
# Save the trainer's model under the relative directory 'bart-model/'.
trainer.save_model('bart-model/')

In [None]:
!zip -r model.zip /kaggle/working/bart-model

# Evaluation

In [None]:
def generate_output(test_samples, model):
    """Generate decoded text for one prompt or a batch of prompts.

    Args:
        test_samples: A single prompt string or a list of prompt strings.
        model: Model exposing ``generate`` and ``device``; the tokenized
            inputs are moved to ``model.device``.

    Returns:
        list[str]: Decoded generations with special tokens stripped.
    """
    inputs = tokenizer(
        test_samples,
        max_length=128,
        # Pad to the longest prompt so a batch of unequal-length prompts can
        # be returned as one tensor; a single prompt is unaffected.
        padding=True,
        # State truncation explicitly rather than relying on how the
        # tokenizer treats a bare max_length.
        truncation=True,
        return_tensors="pt")

    input_ids = inputs.input_ids.to(model.device)
    # Debug output: the raw ids and the decoded first prompt.
    print(input_ids)
    print(tokenizer.decode(input_ids[0], skip_special_tokens=True))
    attention_mask = inputs.attention_mask.to(model.device)
    # Sampling (top-p / top-k) is enabled, so repeated calls may return different text.
    outputs = model.generate(input_ids, attention_mask=attention_mask, min_length=64, max_length=128, do_sample=True, top_p=0.95, top_k=50)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return output_str

In [None]:
# Try the model on one claim; the instruction text is joined to the claim with no separator, the same way preprocess_function builds its inputs (prefix + claim).
generate_output("decompose the claim into subquestions:FAKE:  Commandos from Berkut who refused to kneel have been burned alive in Lviv.", model)