# Install necessary dependencies

In [1]:
!pip install accelerate -U
!pip install -q -U datasets
!pip install scipy
!pip install ipywidgets
!pip install wandb
!pip install transformers
!pip install torch
!pip install sentencepiece
!pip install tqdm
!pip install evaluate
!pip install rouge_score
!pip install huggingface_hub

Collecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/270.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m174.1/270.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.26.1
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.7

 # Import dependencies

In [2]:
# Data libraries
from datasets import load_dataset, Dataset, concatenate_datasets, list_metrics, load_metric

# preprocessing Libraries
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import nltk
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Model libraries
import transformers
import ast
import torch
import sklearn
from transformers import (
    T5Tokenizer, T5TokenizerFast, T5ForConditionalGeneration,
    AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM,
    DefaultDataCollator, T5ForQuestionAnswering, DataCollatorForSeq2Seq, pipeline, AutoModelForQuestionAnswering, BertTokenizer, BertForQuestionAnswering
)

# Metrics library
import evaluate
from evaluate import evaluator

# Visualization library
import wandb

# Warning libraries
import warnings
warnings.filterwarnings('ignore')

# Utility functions
import t5_utils as utils


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Global variables

In [3]:
DATASET = "squad"
PREFIX = "context"
INPUT_COL = "question"
OUTPUT_COL = "answers"
MODEL = "google/flan-t5-base"
TRAIN_SPLIT = "train[:20000]"
VALIDATION_SPLIT = "train[20000:25000]"
TEST_SPLIT = "validation[:5000]"



# Training args
MAX_STEPS = 1000
PER_DEVICE_TRAIN_BATCH_SIZE = 16
PER_DEVICE_EVAL_BATCH_SIZE = 16
SAVE_STEPS = 100
EVAL_STEPS = 100
LOGGING_STEPS = 100
CHECKPOINT_SAVED = 100
OPTIMIZER = "adamw_torch"
# adamw_hf, adamw_torch, adamw_torch_fused, adamw_apex_fused, adamw_anyprecision or adafactor.
LEARNING_RATE = 2.5e-5
COLLATOR = "seq2seq" # seq2seq or default

# for compute metrics
TASK = "text2text-generation"
METRICS = "rouge"


This two variables are defined and override from utility file to avoid error in training. (error : name "tokenizer" is not defined)

In [4]:
utils.metric = evaluate.load(METRICS)
utils.tokenizer = T5Tokenizer.from_pretrained(MODEL)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


This function takes data from HuggingFace or local data, split ( train, test, validation, train[:slicing] ), input_col, target_col, prefix(column or text)

In [5]:
train_data = utils.get_data(
                            DATASET,
                            TRAIN_SPLIT
                            )

Downloading readme:   0%|          | 0.00/7.83k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [6]:
eval_data = utils.get_data(
                           DATASET,
                           VALIDATION_SPLIT
                           )

In [7]:
test_data = utils.get_data(
                           DATASET,
                           TEST_SPLIT
                           )

In [8]:
train_df = pd.DataFrame(train_data)
eval_df = pd.DataFrame(eval_data)
test_df = pd.DataFrame(test_data)

In [9]:
train_df["input_text"] = train_df["context"] + " : " + train_df["question"].astype(str)
eval_df["input_text"] = eval_df["context"] + " : " + eval_df["question"].astype(str)
test_df["input_text"] = test_df["context"] + " : " + test_df["question"].astype(str)

In [10]:
train_df['target_text'] = train_df['answers'].apply(lambda x: x['text'][0])
eval_df['target_text'] = eval_df['answers'].apply(lambda x: x['text'][0])
test_df['target_text'] = test_df['answers'].apply(lambda x: x['text'][0])

In [11]:
train_df.drop(["id", "title", "context", "question", "answers"], axis=1, inplace=True)
eval_df.drop(["id", "title", "context", "question", "answers"], axis=1, inplace=True)
test_df.drop(["id", "title", "context", "question", "answers"], axis=1, inplace=True)

In [12]:
train_data = Dataset.from_pandas(train_df)
eval_data = Dataset.from_pandas(eval_df)
test_data = Dataset.from_pandas(test_df)

In [13]:
max_source_length, max_target_length = utils.identify_max_lengths(
                                                                  train_data,
                                                                  eval_data,
                                                                  model = MODEL
                                                                  )

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [14]:
print(f"Input max length : {max_source_length}")
print("--------------------------------")
print(f"Target max length : {max_target_length}")

Input max length : 512
--------------------------------
Target max length : 76


In [15]:
train_data_tokenized, eval_data_tokenized = utils.preprocessed_data(
                                                                    train_data,
                                                                    eval_data,
                                                                    max_source_length,
                                                                    max_target_length,
                                                                    model = MODEL
                                                                    )

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make s

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Training

toketttttttttttLook at utility functions to get in-depth understanding of training function

In [16]:
utils.train_model(
           train_data_tokenized,
           train_data_tokenized,
           project = "Finetunning-t5",
           model = MODEL,
           base_model_name = "T5",
           max_steps = MAX_STEPS,
           per_device_train_batch_size = PER_DEVICE_TRAIN_BATCH_SIZE,
           per_device_eval_batch_size = PER_DEVICE_EVAL_BATCH_SIZE,
           save_steps = SAVE_STEPS,
           eval_steps = EVAL_STEPS,
           logging_steps = LOGGING_STEPS,
           checkpoint_saved = CHECKPOINT_SAVED,
           optimizer = OPTIMIZER,
           learning_rate = LEARNING_RATE,
           collator = COLLATOR
           )

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
100,0.3695,0.197222,11.6632,6.9578,11.4869,11.4816,80.0
200,0.3228,0.195965,11.4674,6.8213,11.303,11.2968,80.0
300,0.2967,0.190907,11.6041,6.8802,11.4259,11.4281,80.0
400,0.3145,0.184958,11.6518,6.9238,11.4892,11.4884,80.0
500,0.2912,0.188442,11.5353,6.9179,11.3639,11.3609,80.0
600,0.2823,0.191774,11.4337,6.8558,11.2622,11.2641,80.0
700,0.2777,0.184592,11.4771,6.8749,11.3173,11.3156,80.0
800,0.2894,0.180945,11.508,6.8614,11.3484,11.3471,80.0
900,0.2772,0.181505,11.4919,6.8495,11.3313,11.329,80.0
1000,0.284,0.181323,11.498,6.8605,11.334,11.3334,80.0


In [17]:
context = """
Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season.
The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title.
The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives,
as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"),
so that the logo could prominently feature the Arabic numerals 50.
"""
text = "Which NFL team represented the AFC at Super Bowl 50?"

In [18]:
utils.inference(
         context,
         text,
         fined_tuned_path = "/content/T5-Finetunning-t5/checkpoint-1000"
         )

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


'Denver Broncos'

# Push tokenizer and model to hub

In [24]:
tokenizer = T5Tokenizer.from_pretrained("/content/T5-Finetunning-t5/checkpoint-1000")
model = T5ForConditionalGeneration.from_pretrained("/content/T5-Finetunning-t5/checkpoint-1000")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [25]:
tokenizer.push_to_hub("HuzaifaHPC/T5_SQUAD")
model.push_to_hub("HuzaifaHPC/T5_SQUAD")

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HuzaifaHPC/T5_SQUAD/commit/f6bde5c30be731317c9bf3771f76db88632fb531', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='f6bde5c30be731317c9bf3771f76db88632fb531', pr_url=None, pr_revision=None, pr_num=None)