In [None]:
%%bash
# Install the notebook's Python dependencies, one pip call per package, in the
# order listed. "transformers[torch]" is quoted so the square brackets reach
# pip literally instead of being treated by bash as a filename pattern.
for pkg in nltk datasets "transformers[torch]" tokenizers evaluate rouge_score sentencepiece huggingface_hub; do
    pip install "$pkg"
done

In [None]:
import inspect
import os
import shutil
import subprocess
from textwrap import fill

import evaluate
import nltk
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
# Base checkpoint that the model weights and the tokenizer are both loaded from.
MODEL_NAME = "google/flan-t5-base"

# Path to a saved checkpoint directory; nothing in this cell reads it.
last_checkpoint = "/content/drive/MyDrive/checkpoint-3500"

# Instantiate the seq2seq model and its matching tokenizer from MODEL_NAME.
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# Collator used to assemble batches; it is given both the model and the tokenizer.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

In [None]:
# Acquire the training data from Hugging Face.
# DATA_NAME is the dataset identifier handed to `load_dataset`; the object it
# returns is indexed by split name (e.g. "train") in the next cell.
DATA_NAME = "sciq"
science_qa = load_dataset(DATA_NAME)

In [None]:
# Hold out 20% of the original "train" split for evaluation.
# A fixed seed makes the partition identical on every run, so metrics from
# different runs are comparable.
# NOTE: this rebinds `science_qa`; re-run the dataset loading cell before
# re-running this one, otherwise the already-reduced train split is split again.
SPLIT_SEED = 42
science_qa = science_qa["train"].train_test_split(test_size=0.2, seed=SPLIT_SEED)

In [None]:
# Check the length of the data and its structure.
# A bare name as the last expression of a cell is rendered as the cell output.
science_qa

In [None]:
# Instruction text placed in front of every question before it is tokenized.
prefix = "Please answer this question: "
# Defined alongside the prefix; preprocess_function below does not use it.
suffix = " Support your answer."


def preprocess_function(examples):
    """Turn a batch of examples into tokenized model inputs with labels.

    Args:
        examples: batch mapping that provides a "question" sequence and a
            "correct_answer" sequence.

    Returns:
        The tokenizer output for the prefixed questions (truncated to 128
        tokens), with an added "labels" entry holding the token ids of the
        correct answers (truncated to 512 tokens).
    """
    # Model inputs: each question with the instruction prefix in front.
    prompts = []
    for question in examples["question"]:
        prompts.append(prefix + question)
    encoded = tokenizer(prompts, truncation=True, max_length=128)

    # Targets: only the correct answer text is used as the label sequence.
    targets = tokenizer(
        text_target=examples["correct_answer"],
        truncation=True,
        max_length=512,
    )

    encoded["labels"] = targets["input_ids"]
    return encoded

In [None]:
# Map the preprocessing function across our dataset.
# batched=True hands preprocess_function a batch of examples per call rather
# than a single example.
tokenized_dataset = science_qa.map(preprocess_function, batched=True)

In [None]:
# Sentence-tokenizer data needed by nltk.sent_tokenize in compute_metrics.
# Recent NLTK releases look up the "punkt_tab" resource, older ones "punkt";
# fetching both keeps the notebook working with whichever version pip installed.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

# ROUGE scorer used to evaluate generated answers.
metric = evaluate.load("rouge")

In [None]:
def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with the loaded metric.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            also be a tuple whose first element is the token-id array.

    Returns:
        The result of ``metric.compute`` on the decoded, sentence-split
        predictions and references.
    """
    preds, labels = eval_preds
    # Some configurations return extra outputs next to the generated ids.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
    # decoded; swap it for the pad token in BOTH arrays before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    return metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

In [None]:
# Global Parameters
L_RATE = 3e-4                # optimizer learning rate
BATCH_SIZE = 32              # per-device training batch size
PER_DEVICE_EVAL_BATCH = 32   # per-device evaluation batch size
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 5           # passed as save_total_limit
NUM_EPOCHS = 4

# The "evaluate every epoch" option is spelled `evaluation_strategy` in older
# transformers releases and `eval_strategy` in newer ones. Since the install
# cell does not pin a version, use whichever keyword this version accepts.
_accepted_args = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
EVAL_STRATEGY_ARG = "eval_strategy" if "eval_strategy" in _accepted_args else "evaluation_strategy"

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="./results",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   # Evaluate by generating text, so compute_metrics receives token ids.
   predict_with_generate=True,
   push_to_hub=False,
   **{EVAL_STRATEGY_ARG: "epoch"},
)

In [None]:
# Pick the two tokenized splits up front so the trainer call reads cleanly.
train_split = tokenized_dataset["train"]
eval_split = tokenized_dataset["test"]

# Wire the model, data, collator, training arguments and metric function together.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_split,
    eval_dataset=eval_split,
    args=training_args,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

In [None]:
# Save the trained model to the given directory.
# NOTE(review): the path is under /content/drive, but no cell in this notebook
# mounts Google Drive — confirm it is mounted before relying on this path.
trainer.save_model("/content/drive/MyDrive/train3epoch4")

In [None]:
# Reload the fine-tuned model and tokenizer for inference.
# The directory must be the one the notebook saved to with
# trainer.save_model(...); the previous value pointed at a different folder.
last_checkpoint = "/content/drive/MyDrive/train3epoch4"

finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)

In [None]:
# Sample science questions for a quick manual check of the fine-tuned model.
SAMPLE_QUESTIONS = [
    "What is controlled by regulatory proteins that bind to regulatory elements on dna?",
    "Fertilization is the union of a sperm and egg, resulting in the formation of what?",
    "Where do angiosperms produce seeds in flowers?",
    "What is the name of the process by which plants convert light energy into chemical energy?",
    "What is the name of the substance that gives plants their green color?",
    "What is the name of the force that causes objects to fall to the ground?",
    "What is the name of the type of chemical bond that involves the sharing of electrons between atoms?",
    "What is the name of the law that states that the total mass of the reactants in a chemical reaction is equal to the total mass of the products?",
    "What is the name of the process by which a solid substance changes directly into a gas without passing through the liquid state?",
    "What is the name of the smallest particle of an element that retains its chemical properties?",
]

# Question sent to the model; change the index to try a different sample.
my_question = SAMPLE_QUESTIONS[-1]

# The prompt must start with exactly the text used as `prefix` during
# preprocessing, otherwise the model sees a prompt it was not fine-tuned on.
inputs = "Please answer this question: " + my_question

In [None]:
# Display the full prompt string that will be given to the model.
inputs

In [None]:
# Tokenize into a new name so the prompt string in `inputs` stays intact and
# this cell can be re-run without re-running the prompt cell.
encoded_inputs = tokenizer(inputs, return_tensors="pt")
outputs = finetuned_model.generate(**encoded_inputs)

# skip_special_tokens=True keeps special tokens out of the printed answer.
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Wrap long answers at 80 columns for readability.
print(fill(answer, width=80))

In [None]:
# Colab-only helper: zip a checkpoint directory and download it via the browser.
from google.colab import files

dir_to_zip = '/content/results/checkpoint-3500' #@param {type: "string"}
output_filename = 'results.zip' #@param {type: "string"}
delete_dir_after_download = "No"  #@param ['Yes', 'No']

# Fail early with a clear message instead of trying to download a missing archive.
if not os.path.exists(dir_to_zip):
    raise FileNotFoundError("Nothing to zip, path does not exist: {}".format(dir_to_zip))

# Pass arguments as a list (no shell string building) and raise if zip fails.
subprocess.run(["zip", "-r", output_filename, dir_to_zip], check=True)

files.download(output_filename)

# Only remove the source once the archive exists and the download has been started.
if delete_dir_after_download == "Yes":
    if os.path.isdir(dir_to_zip):
        shutil.rmtree(dir_to_zip)
    else:
        os.remove(dir_to_zip)