

# Install necessary dependencies

In [1]:
# Use the %pip magic (not !pip) so packages are installed into the environment
# of the running kernel rather than whatever `pip` is first on the shell PATH.
# Packages that were explicitly upgraded before are still upgraded.
%pip install -q -U accelerate datasets
# Everything else is installed only if missing. nltk, seaborn, scikit-learn,
# pandas and numpy are imported below, so they are listed here as well; this is
# a no-op on hosts (e.g. Colab) that already ship them.
%pip install -q scipy ipywidgets wandb transformers torch sentencepiece tqdm evaluate rouge_score nltk seaborn scikit-learn pandas numpy

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.4/196.4 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━

 # Import dependencies

In [None]:
# Data libraries
from datasets import load_dataset, Dataset, concatenate_datasets
try:
    # Legacy metrics API; only present in datasets < 3.0.
    from datasets import list_metrics, load_metric
except ImportError:
    # The install cell upgrades `datasets`, and releases >= 3.0 removed the
    # metrics API. Fall back to the closest `evaluate` equivalents so the
    # names stay importable (signatures are similar but not identical).
    from evaluate import list_evaluation_modules as list_metrics
    from evaluate import load as load_metric

# preprocessing Libraries
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import nltk
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Model libraries
import transformers
import ast
import torch
import sklearn
from transformers import (
    T5Tokenizer, T5TokenizerFast, T5ForConditionalGeneration,
    AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM,
    DefaultDataCollator, T5ForQuestionAnswering, DataCollatorForSeq2Seq, pipeline, AutoModelForQuestionAnswering, BertTokenizer, BertForQuestionAnswering
)

# Metrics library
import evaluate
from evaluate import evaluator

# Visualization library
import wandb

# Warning libraries
import warnings
warnings.filterwarnings('ignore')

# Utility functions
import t5_utils as utils


# **Data Visualization**

In [3]:
# Fetch every split of the SAMSum dataset, then show the split layout.
data_raw = load_dataset(path="samsum")
print(data_raw)

Downloading data:   0%|          | 0.00/6.06M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/347k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/335k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})


In [4]:
# Peek at the first training record (fields: id, dialogue, summary).
data_raw["train"][0]

{'id': '13818513',
 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}

# Global variables

In [5]:
# Dataset, column names and prompt prefix
DATASET = "samsum"
INPUT_COL, OUTPUT_COL = "dialogue", "summary"
PREFIX = "summarize"
TRAIN_SPLIT, VALIDATION_SPLIT, TEST_SPLIT = "train", "validation", "test"

# Base checkpoint that gets fine-tuned
MODEL = "google/flan-t5-base"

# Training args
MAX_STEPS = 2_000
PER_DEVICE_TRAIN_BATCH_SIZE = PER_DEVICE_EVAL_BATCH_SIZE = 32
# Checkpointing, evaluation and logging all run on the same step interval.
SAVE_STEPS = EVAL_STEPS = LOGGING_STEPS = 100
CHECKPOINT_SAVED = 100
# Accepted optimizer names:
# adamw_hf, adamw_torch, adamw_torch_fused, adamw_apex_fused, adamw_anyprecision or adafactor.
OPTIMIZER = "adamw_torch"
LEARNING_RATE = 2.5e-5
COLLATOR = "seq2seq"  # seq2seq or default

# for compute metrics
TASK = "text2text-generation"
METRICS = "rouge"


These two variables are defined here and override the ones in the utility file, to avoid an error during training (error: name "tokenizer" is not defined).

In [6]:
# Set the tokenizer and metric as attributes on the utility module so its
# helpers can find them (presumably they are read there as module globals).
utils.tokenizer = T5Tokenizer.from_pretrained(MODEL)
utils.metric = evaluate.load(METRICS)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


`prepare_data` takes the dataset (a HuggingFace dataset name or local data), the split (`train`, `test`, `validation`, or a slice such as `train[:slicing]`), the input column, the target column, and the prefix (either a column name or literal text).

In [7]:
# Training split in the input_text / target_text layout.
train_data = utils.prepare_data(
    DATASET, TRAIN_SPLIT, INPUT_COL, OUTPUT_COL, PREFIX
)
print(train_data)

Dataset({
    features: ['input_text', 'target_text'],
    num_rows: 14732
})


In [8]:
# Validation split in the input_text / target_text layout.
eval_data = utils.prepare_data(
    DATASET, VALIDATION_SPLIT, INPUT_COL, OUTPUT_COL, PREFIX
)
print(eval_data)

Dataset({
    features: ['input_text', 'target_text'],
    num_rows: 818
})


In [10]:
# Test split in the input_text / target_text layout.
test_data = utils.prepare_data(
    DATASET, TEST_SPLIT, INPUT_COL, OUTPUT_COL, PREFIX
)
print(test_data)

Dataset({
    features: ['input_text', 'target_text'],
    num_rows: 819
})


In [10]:
# Ask the helper for the source/target length limits, based on the train and
# validation sets and the tokenizer of MODEL.
max_source_length, max_target_length = utils.identify_max_lengths(
    train_data, eval_data, model=MODEL
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/15550 [00:00<?, ? examples/s]

Map:   0%|          | 0/15550 [00:00<?, ? examples/s]

In [11]:
# Report both length limits, separated by a rule.
print(
    f"Input max length : {max_source_length}",
    "--------------------------------",
    f"Target max length : {max_target_length}",
    sep="\n",
)

Input max length : 512
--------------------------------
Target max length : 94


In [12]:
# Tokenize the train and validation sets using the length limits found above.
train_data_tokenized, eval_data_tokenized = utils.preprocessed_data(
    train_data,
    eval_data,
    max_source_length,
    max_target_length,
    model=MODEL,
)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make s

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Training

See the utility functions (`t5_utils`) for an in-depth understanding of the training function.

In [15]:
# Launch fine-tuning; every hyperparameter comes from the global config cell.
utils.train_model(
    train_data_tokenized,
    eval_data_tokenized,
    # run / model identification
    project="Finetunning-t5-Summarization3",
    model=MODEL,
    base_model_name="T5",
    # schedule and batch sizes
    max_steps=MAX_STEPS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    # checkpointing / evaluation / logging cadence
    save_steps=SAVE_STEPS,
    eval_steps=EVAL_STEPS,
    logging_steps=LOGGING_STEPS,
    checkpoint_saved=CHECKPOINT_SAVED,
    # optimisation
    optimizer=OPTIMIZER,
    learning_rate=LEARNING_RATE,
    collator=COLLATOR,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
100,1.4673,1.408862,24.6162,13.4058,22.7956,23.8356,96.0
200,1.4568,1.405079,24.4844,13.3715,22.748,23.7347,96.0
300,1.4512,1.402324,24.4754,13.3903,22.7623,23.6767,96.0


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
100,1.4673,1.408862,24.6162,13.4058,22.7956,23.8356,96.0
200,1.4568,1.405079,24.4844,13.3715,22.748,23.7347,96.0
300,1.4512,1.402324,24.4754,13.3903,22.7623,23.6767,96.0
400,1.4564,1.396963,24.521,13.4307,22.8242,23.8215,96.0
500,1.4425,1.400815,24.6122,13.4725,22.8557,23.8653,96.0
600,1.4095,1.395784,24.6781,13.5334,22.9261,23.9372,96.0
700,1.4168,1.392057,24.7888,13.6283,23.0192,24.0553,96.0
800,1.4066,1.391497,24.7734,13.6842,23.0375,24.027,96.0
900,1.4138,1.389334,24.6976,13.588,22.9691,23.9532,96.0
1000,1.395,1.38661,24.7356,13.6367,22.9929,24.0004,96.0


# **Evaluation**

In [None]:
# Score the fine-tuned checkpoint on the held-out test split.
# The metric name comes from the METRICS config constant, so evaluation uses
# the same metric that was loaded for training.
# NOTE(review): the checkpoint path is an absolute Google Drive location and
# assumes the drive is mounted and that checkpoint-2000 was written — confirm.
formatted_results = utils.computing_metrics_for_test(
    "summarization",
    model="/content/drive/MyDrive/T5-Finetunning-t5-Summarization3/checkpoint-2000",
    metrics=METRICS,
    test_data=test_data,
)

In [None]:
# Show the formatted test metrics for the fine-tuned checkpoint.
print(f"Testing Results of finetuned model: {formatted_results}")

Testing Results of finetuned model: {'rouge1': '45.90%', 'rouge2': '22.95%', 'rougeL': '36.61%', 'rougeLsum': '36.62%', 'total_time_in_seconds': '631.25', 'samples_per_second': '1.30', 'latency_in_seconds': '0.77'}


In [None]:
# Baseline: score the untouched base checkpoint on the same test split.
# MODEL and METRICS come from the config cell, so the baseline always matches
# the checkpoint that was fine-tuned and the metric used during training.
formatted_results_pretrain = utils.computing_metrics_for_test(
    "summarization",
    model=MODEL,
    metrics=METRICS,
    test_data=test_data,
)


In [None]:
# Show the formatted test metrics for the pretrained baseline.
print(f"Testing Results of pretrained model: {formatted_results_pretrain}")

Testing Results of pretrained model: {'rouge1': '45.15%', 'rouge2': '21.99%', 'rougeL': '35.70%', 'rougeLsum': '35.73%', 'total_time_in_seconds': '633.30', 'samples_per_second': '1.29', 'latency_in_seconds': '0.77'}


# **Inference from pretrained model**

---



In [25]:
# Pick one prepared test example and show it.
dialogue = test_data[1]['input_text']
print(f"dialogue: \n{dialogue}\n---------------")

# Summarize the dialogue with the base checkpoint (MODEL from the config cell,
# so this always matches the model that was fine-tuned).
# NOTE(review): `input_text` already starts with "summarize : " (see the printed
# dialogue), so passing 'Summarize ' here may apply the prefix twice — confirm
# how utils.inference combines its first two arguments.
result = utils.inference(
    'Summarize ',
    dialogue,
    fined_tuned_path=MODEL,
)

print(f"flan-t5-base summary:\n{result}")



dialogue: 
summarize : Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)
---------------


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


flan-t5-base summary:
Eric and Rob are watching a stand-up on youtube.


# **Inference from fine-tuned FLAN-T5-base model**

In [26]:
# Pick the same prepared test example used for the base model and show it.
dialogue = test_data[1]['input_text']
print(f"dialogue: \n{dialogue}\n---------------")

# Summarize the dialogue with the fine-tuned checkpoint.
# NOTE(review): `input_text` already starts with "summarize : " (see the printed
# dialogue), so passing 'Summarize ' here may apply the prefix twice — confirm
# how utils.inference combines its first two arguments.
result = utils.inference(
    'Summarize ',
    dialogue,
    fined_tuned_path="/content/drive/MyDrive/T5-Finetunning-t5-Summarization3/checkpoint-2000",
)

# Label the output as coming from the fine-tuned model; the previous label
# ("flan-t5-base summary") made it indistinguishable from the baseline cell.
print(f"fine-tuned flan-t5-base summary:\n{result}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


dialogue: 
summarize : Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)
---------------
flan-t5-base summary:
Eric and Rob are watching a stand-up on youtube.
