# Fine-Tuning T5 for SQL

In [1]:
import os
import torch
import time
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
from transformers import TextDataset, T5ForConditionalGeneration
from datasets import Dataset, DatasetDict, load_dataset, interleave_datasets, concatenate_datasets

2024-10-06 13:58:08.930609: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Environment
Check settings

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"        
print(f"Using device: {device}")

Using device: cpu


## Data
Load datasets

In [3]:
dataset_csql_train = load_dataset("b-mc2/sql-create-context", split='train[:8000]')
dataset_csql_test = load_dataset("b-mc2/sql-create-context", split='train[-2000:-1000]')
dataset_csql_validation = load_dataset("b-mc2/sql-create-context", split='train[-1000:]')

dataset_tsql_train = load_dataset("Clinton/Text-to-sql-v1", split='train[:8000]')
dataset_tsql_train = dataset_tsql_train.remove_columns(['source', 'text'])
dataset_tsql_train = dataset_tsql_train.rename_columns({'instruction': 'question', 'input': 'context', 'response': 'answer'})

dataset_tsql_test  = load_dataset("Clinton/Text-to-sql-v1", split='train[-2000:-1000]')
dataset_tsql_test  = dataset_tsql_test.remove_columns(['source', 'text'])
dataset_tsql_test  = dataset_tsql_test.rename_columns({'instruction': 'question', 'input': 'context', 'response': 'answer'})

dataset_tsql_validation   = load_dataset("Clinton/Text-to-sql-v1", split='train[-1000:]')
dataset_tsql_validation   = dataset_tsql_validation.remove_columns(['source', 'text'])
dataset_tsql_validation   = dataset_tsql_validation.rename_columns({'instruction': 'question', 'input': 'context', 'response': 'answer'})

Using the latest cached version of the dataset since b-mc2/sql-create-context couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/johnmoses/.cache/huggingface/datasets/b-mc2___sql-create-context/default/0.0.0/9d80a6a118b838d9defc3798d659a54a2ac2ff37 (last modified on Mon Sep 16 10:40:01 2024).
Using the latest cached version of the dataset since b-mc2/sql-create-context couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/johnmoses/.cache/huggingface/datasets/b-mc2___sql-create-context/default/0.0.0/9d80a6a118b838d9defc3798d659a54a2ac2ff37 (last modified on Mon Sep 16 10:40:01 2024).
Using the latest cached version of the dataset since b-mc2/sql-create-context couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/johnmoses/.cache/huggingface/datasets/b-mc2___sql-create-context/default/0.0.0/9d80a6a118b838d9defc3798d659a54a2

In [7]:
dataset_train_merged = concatenate_datasets(
    [
        dataset_csql_train, 
        dataset_tsql_train
        ]
    )
dataset_test_merged = concatenate_datasets(
    [
        dataset_csql_test, 
        dataset_tsql_test
        ]
    )
dataset_validation_merged = concatenate_datasets(
    [
        dataset_csql_validation, 
        dataset_tsql_validation
        ]
    )

In [8]:
dataset_train_merged.to_csv('train_merged.csv', index=False)
dataset_test_merged.to_csv('test_merged.csv', index=False)
dataset_validation_merged.to_csv('validation_merged.csv', index=False)

Creating CSV from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

1259325

In [9]:
dataset_train_merged

Dataset({
    features: ['context', 'question', 'answer'],
    num_rows: 16000
})

In [10]:
dataset_test_merged

Dataset({
    features: ['context', 'question', 'answer'],
    num_rows: 2000
})

In [11]:
dataset_validation_merged

Dataset({
    features: ['context', 'question', 'answer'],
    num_rows: 2000
})

In [12]:
dataset = load_dataset('csv', data_files={
    "train": "train_merged.csv", 
    "test": "test_merged.csv", 
    "validation": "validation_merged.csv"
    })

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 2000
    })
})

In [14]:
dataset['test'][0]

{'context': 'CREATE TABLE table_name_94 (round VARCHAR, event VARCHAR)',
 'question': 'Which round has pain and glory 2006 as the event?',
 'answer': 'SELECT round FROM table_name_94 WHERE event = "pain and glory 2006"'}

## Model and Tokenizer
Define configuration settings

In [16]:
model_name='google/flan-t5-small'
# model_name='t5-small'
os.environ['TOKENIZERS_PARALLELISM'] = 'true' 

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
base_model = base_model.to(device)

In [18]:
tokenizer = AutoTokenizer.from_pretrained(model_name)



Preprocess datasets

In [19]:
def tokenize_function(sample):
    start_prompt = "Tables:\n"
    middle_prompt = "\n\nQuestion:\n"
    end_prompt = "\n\nAnswer:\n"

    data_zip = zip(sample['context'], sample['question'])
    prompt = [start_prompt + context + middle_prompt + question + end_prompt for context, question in data_zip]
    sample['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    sample['labels'] = tokenizer(sample['answer'], padding="max_length", truncation=True, return_tensors="pt").input_ids
    return sample

In [20]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['question','context','answer'])


Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [21]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 2000
    })
})

In [22]:
tokenized_datasets.keys()

dict_keys(['train', 'test', 'validation'])

In [23]:
tokenized_datasets['train'][0].keys()

dict_keys(['input_ids', 'labels'])

In [24]:
tokenized_datasets['train'][0]['input_ids'][:10]

[4398, 7, 10, 205, 4386, 6048, 332, 17098, 819, 41]

In [25]:
tokenized_datasets['train'][0]['labels'][:10]

[3, 23143, 14196, 2847, 17161, 599, 1935, 61, 21680, 819]

In [26]:
tokenized_datasets['train'].shape

(16000, 2)

In [27]:
tokenized_datasets['validation'].shape

(2000, 2)

In [28]:
tokenized_datasets['test'].shape

(2000, 2)

Test model with zero shot prediction/inferencing

In [29]:
index = 0

question = dataset['test'][index]['question']
context = dataset['test'][index]['context']
answer = dataset['test'][index]['answer']

prompt = f"""Tables:
{context}

Question:
{question}

Answer:
"""

inputs = tokenizer(prompt, return_tensors='pt')
inputs = inputs.to('cpu')

output = tokenizer.decode(
    base_model.generate(
        inputs["input_ids"], 
        max_new_tokens=200,
    )[0], 
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN ANSWER:\n{answer}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Tables:
CREATE TABLE table_name_94 (round VARCHAR, event VARCHAR)

Question:
Which round has pain and glory 2006 as the event?

Answer:

---------------------------------------------------------------------------------------------------
BASELINE HUMAN ANSWER:
SELECT round FROM table_name_94 WHERE event = "pain and glory 2006"

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
94


Fine tune

In [30]:
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
finetuned_model = finetuned_model.to('cpu')
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [32]:
output_dir = 'flan-t5-text2sql-log'

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    learning_rate=5e-3,
    num_train_epochs=2,
    per_device_train_batch_size=16,     # batch size per device during training
    per_device_eval_batch_size=16,      # batch size for evaluation
    weight_decay=0.01,
    logging_steps=50,
    evaluation_strategy='steps',        # evaluation strategy to adopt during training
    eval_steps=500,                  
)

trainer = Trainer(
    model=finetuned_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

Load model from local folder

In [34]:
trainer.evaluate()

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 53.72148895263672,
 'eval_model_preparation_time': 0.0026,
 'eval_runtime': 84.8302,
 'eval_samples_per_second': 23.577,
 'eval_steps_per_second': 1.474}

In [35]:
%%time

trainer.train()

  0%|          | 0/2000 [00:00<?, ?it/s]

{'loss': 2.6362, 'grad_norm': 0.1208856850862503, 'learning_rate': 0.004875, 'epoch': 0.05}
{'loss': 0.2077, 'grad_norm': 0.16979654133319855, 'learning_rate': 0.00475, 'epoch': 0.1}
{'loss': 0.1865, 'grad_norm': 0.0690625011920929, 'learning_rate': 0.004625000000000001, 'epoch': 0.15}
{'loss': 0.161, 'grad_norm': 0.10531901568174362, 'learning_rate': 0.0045000000000000005, 'epoch': 0.2}
{'loss': 0.1549, 'grad_norm': 0.10077470541000366, 'learning_rate': 0.004375, 'epoch': 0.25}
{'loss': 0.1506, 'grad_norm': 0.09236154705286026, 'learning_rate': 0.00425, 'epoch': 0.3}
{'loss': 0.1494, 'grad_norm': 0.12148416042327881, 'learning_rate': 0.004125, 'epoch': 0.35}
{'loss': 0.1527, 'grad_norm': 0.11464022845029831, 'learning_rate': 0.004, 'epoch': 0.4}
{'loss': 0.1483, 'grad_norm': 0.22886568307876587, 'learning_rate': 0.0038750000000000004, 'epoch': 0.45}
{'loss': 0.1424, 'grad_norm': 0.07426823675632477, 'learning_rate': 0.00375, 'epoch': 0.5}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.1329299360513687, 'eval_model_preparation_time': 0.0026, 'eval_runtime': 85.7269, 'eval_samples_per_second': 23.33, 'eval_steps_per_second': 1.458, 'epoch': 0.5}
{'loss': 0.1408, 'grad_norm': 0.17206546664237976, 'learning_rate': 0.0036249999999999998, 'epoch': 0.55}
{'loss': 0.1313, 'grad_norm': 0.06367618590593338, 'learning_rate': 0.0034999999999999996, 'epoch': 0.6}
{'loss': 0.1383, 'grad_norm': 0.16437524557113647, 'learning_rate': 0.0033750000000000004, 'epoch': 0.65}
{'loss': 0.1267, 'grad_norm': 0.08070875704288483, 'learning_rate': 0.0032500000000000003, 'epoch': 0.7}
{'loss': 0.1303, 'grad_norm': 0.1410449594259262, 'learning_rate': 0.003125, 'epoch': 0.75}
{'loss': 0.1271, 'grad_norm': 0.10282577574253082, 'learning_rate': 0.003, 'epoch': 0.8}
{'loss': 0.1282, 'grad_norm': 0.1529235988855362, 'learning_rate': 0.002875, 'epoch': 0.85}
{'loss': 0.1255, 'grad_norm': 0.06291164457798004, 'learning_rate': 0.0027500000000000003, 'epoch': 0.9}
{'loss': 0.1265, 'grad

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.11903002858161926, 'eval_model_preparation_time': 0.0026, 'eval_runtime': 85.4367, 'eval_samples_per_second': 23.409, 'eval_steps_per_second': 1.463, 'epoch': 1.0}
{'loss': 0.1161, 'grad_norm': 0.11356052756309509, 'learning_rate': 0.002375, 'epoch': 1.05}
{'loss': 0.1123, 'grad_norm': 0.0594414547085762, 'learning_rate': 0.0022500000000000003, 'epoch': 1.1}
{'loss': 0.1075, 'grad_norm': 0.09283460676670074, 'learning_rate': 0.002125, 'epoch': 1.15}
{'loss': 0.1114, 'grad_norm': 0.12232553958892822, 'learning_rate': 0.002, 'epoch': 1.2}
{'loss': 0.1087, 'grad_norm': 0.07208883762359619, 'learning_rate': 0.001875, 'epoch': 1.25}
{'loss': 0.1099, 'grad_norm': 0.10732711851596832, 'learning_rate': 0.0017499999999999998, 'epoch': 1.3}
{'loss': 0.1079, 'grad_norm': 0.05342360585927963, 'learning_rate': 0.0016250000000000001, 'epoch': 1.35}
{'loss': 0.1089, 'grad_norm': 0.10596264898777008, 'learning_rate': 0.0015, 'epoch': 1.4}
{'loss': 0.1071, 'grad_norm': 0.080254852771759

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.11296973377466202, 'eval_model_preparation_time': 0.0026, 'eval_runtime': 85.5308, 'eval_samples_per_second': 23.383, 'eval_steps_per_second': 1.461, 'epoch': 1.5}
{'loss': 0.1013, 'grad_norm': 0.058875277638435364, 'learning_rate': 0.0011250000000000001, 'epoch': 1.55}
{'loss': 0.1009, 'grad_norm': 0.05465039610862732, 'learning_rate': 0.001, 'epoch': 1.6}
{'loss': 0.1038, 'grad_norm': 0.0664030909538269, 'learning_rate': 0.0008749999999999999, 'epoch': 1.65}
{'loss': 0.1029, 'grad_norm': 0.08152158558368683, 'learning_rate': 0.00075, 'epoch': 1.7}
{'loss': 0.1012, 'grad_norm': 0.05129268765449524, 'learning_rate': 0.000625, 'epoch': 1.75}
{'loss': 0.0974, 'grad_norm': 0.07165384292602539, 'learning_rate': 0.0005, 'epoch': 1.8}
{'loss': 0.0976, 'grad_norm': 0.051256585866212845, 'learning_rate': 0.000375, 'epoch': 1.85}
{'loss': 0.0986, 'grad_norm': 0.061065372079610825, 'learning_rate': 0.00025, 'epoch': 1.9}
{'loss': 0.1008, 'grad_norm': 0.07675628364086151, 'learnin

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.10892447084188461, 'eval_model_preparation_time': 0.0026, 'eval_runtime': 85.0228, 'eval_samples_per_second': 23.523, 'eval_steps_per_second': 1.47, 'epoch': 2.0}
{'train_runtime': 13097.2173, 'train_samples_per_second': 2.443, 'train_steps_per_second': 0.153, 'train_loss': 0.18727583980560303, 'epoch': 2.0}
CPU times: user 31min 24s, sys: 16min 16s, total: 47min 40s
Wall time: 3h 38min 17s


TrainOutput(global_step=2000, training_loss=0.18727583980560303, metrics={'train_runtime': 13097.2173, 'train_samples_per_second': 2.443, 'train_steps_per_second': 0.153, 'total_flos': 5948496150528000.0, 'train_loss': 0.18727583980560303, 'epoch': 2.0})

In [36]:
finetuned_model.save_pretrained("flan-t5-text2sql")

In [37]:
tokenizer.save_pretrained("flan-t5-text2sql")

('flan-t5-text2sql/tokenizer_config.json',
 'flan-t5-text2sql/special_tokens_map.json',
 'flan-t5-text2sql/tokenizer.json')

In [39]:
model_path = "flan-t5-text2sql"
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
finetuned_model = finetuned_model.to(device)

finetuned_tokenizer = AutoTokenizer.from_pretrained(model_path)

Test fine-tuned model with zero shot inferencing

In [40]:
index = 1
# index = len(dataset['test'])-200

question = dataset['test'][index]['question']
context = dataset['test'][index]['context']
answer = dataset['test'][index]['answer']

prompt = f"""Tables:
{context}

Question:
{question}

Answer:
"""

inputs = tokenizer(prompt, return_tensors='pt')
inputs = inputs.to('cpu')

output = tokenizer.decode(
    finetuned_model.generate(
        inputs["input_ids"], 
        max_new_tokens=200,
    )[0], 
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN ANSWER:\n{answer}\n')
print(dash_line)
print(f'FINE-TUNED MODEL - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Tables:
CREATE TABLE table_name_14 (record VARCHAR, opponent VARCHAR)

Question:
Which record has john flemming as the opponent?

Answer:

---------------------------------------------------------------------------------------------------
BASELINE HUMAN ANSWER:
SELECT record FROM table_name_14 WHERE opponent = "john flemming"

---------------------------------------------------------------------------------------------------
FINE-TUNED MODEL - ZERO SHOT:
SELECT COUNT(DISTINCT demographic.subject_id) FROM demographic INNER JOIN diagnoses ON demographic.hadm_id = diagnoses.hadm_id WHERE demographic.admission_location = "HOME HEALTH CARE" AND diagnoses.long_title = "Acute glomerulas"


Test fine-tuned model with query

In [41]:
def get_sql(query):
    prompt = "translate English to SQL: %s " % query
    features = finetuned_tokenizer([prompt], return_tensors='pt')
    output = finetuned_model.generate(
        input_ids=features['input_ids'],
        max_new_tokens=200
        )
    return finetuned_tokenizer.decode(output[0], skip_special_tokens=True)

In [42]:
query = "Which record has john flemming as the opponent?"
get_sql(query)

'SELECT COUNT(DISTINCT demographic.subject_id) FROM demographic INNER JOIN diagnoses ON demographic.hadm_id = diagnoses.hadm_id WHERE demographic.admission_location = "HOME HEALTH CARE" AND diagnoses.long_title = "Acute glomerulasis"'

In [43]:
for i in range(10,15, 1):
    print('Question: ' + dataset['test'][i]['question'])
    print('Predict. :' + get_sql(dataset['test'][i]['question']))
    print('Expected: ' + dataset['test'][i]['answer'])
    print('=================================\n')

Question: How many weeks did the single that entered the charts 14 september 2002 stay on the charts ?
Predict. :SELECT COUNT(DISTINCT demographic.subject_id) FROM demographic INNER JOIN diagnoses ON demographic.hadm_id = diagnoses.hadm_id WHERE demographic.admission_location = "HOME HEALTH CARE" AND diagnoses.long_title = "Acute glomerulasis"
Expected: SELECT COUNT(weeks_on_chart__uk_) FROM table_name_30 WHERE entered_chart__uk_ = "14 september 2002"

Question: Which single was on the Charts for 23 weeks ?
Predict. :SELECT COUNT(DISTINCT demographic.subject_id) FROM demographic INNER JOIN diagnoses ON demographic.hadm_id = diagnoses.hadm_id WHERE demographic.admission_location = "HOME HEALTH CARE" AND diagnoses.long_title = "Acute glomerulasis"
Expected: SELECT title FROM table_name_50 WHERE weeks_on_chart__uk_ = 23

Question: How many ECTS credit points occur with Master in Management?
Predict. :SELECT COUNT(DISTINCT demographic.subject_id) FROM demographic INNER JOIN diagnoses ON de

Evaluate

In [44]:
import evaluate
import pandas as pd

In [45]:
questions = dataset['test'][0:10]['question']
contexts = dataset['test'][0:10]['context']
human_answers = dataset['test'][0:10]['answer']

base_model_answers = []
finetuned_model_answers = []

for idx, question in enumerate(questions):
    prompt = f"""Tables:
    {contexts[idx]}

    Question:
    {question}

    Answer:
    """
    
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids
    input_ids = input_ids.to(device)

    human_text_output = human_answers[idx]

    base_model_outputs = base_model.generate(input_ids=input_ids,generation_config=GenerationConfig(max_new_tokens=200))
    base_model_output = tokenizer.decode(base_model_outputs[0], skip_special_tokens=True)
    base_model_answers.append(base_model_output)

    finetuned_model_outputs = finetuned_model.generate(input_ids=input_ids,generation_config=GenerationConfig(max_new_tokens=200))
    finetuned_model_output = tokenizer.decode(finetuned_model_outputs[0], skip_special_tokens=True)
    finetuned_model_answers.append(finetuned_model_output)

In [46]:
zipped_summaries = list(zip(human_answers, base_model_answers, finetuned_model_answers))
df = pd.DataFrame(zipped_summaries, columns=['human_answers','base_model','finetuned_model'])
df

Unnamed: 0,human_answers,base_model,finetuned_model
0,SELECT round FROM table_name_94 WHERE event = ...,94,SELECT COUNT(DISTINCT demographic.subject_id) ...
1,SELECT record FROM table_name_14 WHERE opponen...,a slam,SELECT COUNT(DISTINCT demographic.subject_id) ...
2,SELECT record FROM table_name_49 WHERE time = ...,5:00,SELECT COUNT(DISTINCT demographic.subject_id) ...
3,"SELECT score FROM table_name_37 WHERE home = ""...",0,SELECT COUNT(DISTINCT demographic.subject_id) ...
4,"SELECT home FROM table_name_93 WHERE score = ""...",st. louis,SELECT COUNT(DISTINCT demographic.subject_id) ...
5,SELECT engine FROM table_name_85 WHERE entrant...,VARCHAR,SELECT COUNT(DISTINCT demographic.subject_id) ...
6,SELECT chassis FROM table_name_39 WHERE year <...,a styrofoam,SELECT COUNT(DISTINCT demographic.subject_id) ...
7,SELECT chassis FROM table_name_5 WHERE entrant...,VARCHAR,SELECT COUNT(DISTINCT demographic.subject_id) ...
8,SELECT MIN(attendance) FROM table_name_3 WHERE...,0,SELECT COUNT(DISTINCT demographic.subject_id) ...
9,SELECT MIN(attendance) FROM table_name_7 WHERE...,0,SELECT COUNT(DISTINCT demographic.subject_id) ...


Compute ROUGE score for subset of data

In [47]:
# Load ROUGE
rouge = evaluate.load('rouge')

# Select dataset
answers = dataset['test'][0:10]['answer']

base_model_results = rouge.compute(
    predictions=base_model_answers,
    references=answers,
    use_aggregator=True,
    use_stemmer=True,
)
print('Base Model:\n',base_model_results)

finetuned_model_results = rouge.compute(
    predictions=finetuned_model_answers,
    references=answers,
    use_aggregator=True,
    use_stemmer=True,
)
print('Fine-tuned model:\n',finetuned_model_results)

Base Model:
 {'rouge1': 0.03643724696356275, 'rouge2': 0.011764705882352941, 'rougeL': 0.03643724696356275, 'rougeLsum': 0.03643724696356275}
Fine-tuned model:
 {'rouge1': 0.17220223055447104, 'rouge2': 0.0, 'rougeL': 0.16323084176175495, 'rougeLsum': 0.163230841761755}


ROUGE score for flan t5
Base Model:
 {'rouge1': 0.03643724696356275, 'rouge2': 0.011764705882352941, 'rougeL': 0.03643724696356275, 'rougeLsum': 0.03643724696356275}
Fine-tuned model:
 {'rouge1': 0.17220223055447104, 'rouge2': 0.0, 'rougeL': 0.16323084176175495, 'rougeLsum': 0.163230841761755}
