In [1]:
import os
import torch
import numpy as np
import pandas as pd

from trl import SFTTrainer
from datasets import Dataset
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,TrainingArguments

# 1. Import config

In [2]:
import yaml

with open("config.yaml", 'r') as stream:
    try:
        config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)

# 2. Import data

In [None]:
from utils import import_data_from_json

device = torch.device('mps')

# airbus_datapath = os.path.join("./data/", "airbus_helicopters_train_set.json")
# train_dataset, val_dataset, test_dataset = import_data_from_json(airbus_datapath)

model_name = "google-t5/t5-small"
# model_name = "facebook/bart-base"
# model_name = "HuggingFaceH4/zephyr-7b-beta"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
model.save_pretrained("./weight/google-t5/t5-small")

# 3. Fine tuning 

In [4]:
# Create the trainer
from utils import prompt_instruction_format_t5, prompt_instruction_format_bart

trainingArgs = TrainingArguments(**config['parameters_ft'])

peft_config = LoraConfig(**config['parameters_LoRA'])

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset = val_dataset,
    peft_config=peft_config,
    tokenizer=tokenizer,
    packing=True,
    # formatting_func=prompt_instruction_format_bart, 
    formatting_func=prompt_instruction_format_t5,
    args=trainingArgs,
    max_seq_length=512
)

trainer.train()

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/250 [00:00<?, ?it/s]

Checkpoint destination directory output_ft/checkpoint-25 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory output_ft/checkpoint-50 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory output_ft/checkpoint-75 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory output_ft/checkpoint-100 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory output_ft/checkpoint-125 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory output_ft/checkpoint-150 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory output_ft/checkpoint-175 already exists and is non-empty. Saving will proceed but saved results may 

{'train_runtime': 357.5158, 'train_samples_per_second': 2.797, 'train_steps_per_second': 0.699, 'train_loss': 0.5465741577148437, 'epoch': 10.0}


TrainOutput(global_step=250, training_loss=0.5465741577148437, metrics={'train_runtime': 357.5158, 'train_samples_per_second': 2.797, 'train_steps_per_second': 0.699, 'train_loss': 0.5465741577148437, 'epoch': 10.0})

# 4. Test 

In [8]:
def predict_from_model(model, legal_text, device): 

    model.eval()
    model.to(device)
    
    # prompt = f"""
    # Summarize the following legal text.

    # {legal_text}

    # Summary:
    # """

    prompt = f"""
    {legal_text}
    """

    inputs = tokenizer(prompt, return_tensors='pt')
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad(): 
        output_ids = model.generate(**inputs)
        output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    return output, prompt

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

index = 1

legal_text = test_dataset['original_text'][index]
summary = test_dataset['reference_summary'][index]

output, prompt = predict_from_model(model, legal_text, device)

dash_line = '-'.join('' for _ in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

    Risks of loss or damage to the Equipment shall pass: From the Purchaser to the Supplier upon delivery as per Article 5, after the signature of the Reception Document; From the Supplier to the Purchaser upon return as per Article 6, after the signature of the Return Document.
    
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Risk of loss or damage to the Equipment shall pass: From the Purchaser to the Supplier upon delivery, From the Supplier to the Purchaser upon return

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
Risks of loss or damage to the Equipment shall pass: From the Purchaser to the Supplier


# 6. Custom class 

In [2]:
import yaml

from loguru import logger
from utils import compute_similarity_scores_text

class TextSummarizer(): 
    def __init__(self, config_file_path):
        with open(config_file_path, 'r') as file:
            try:    
                self.config = yaml.safe_load(file)
            except yaml.YAMLError as exc:
                print(exc)
        logger.info("Config info loaded")

        # With cuda
        # self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # With MPS (Mac Silicon)
        self.device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
        
        ### Partie LLM Summarization
        self.n_brut_force = self.config['parameters_textsum_archi']['n_brut_force']
        models_params = self.config['models']['models_params']
        self.models_no_RAG = self.create_models_out_of_RAG(models_params, self.device)
        logger.info("Models loaded")

        ### Partie LLM + RAG
        datapath = self.config['dbRAG']['datapath']
        # Load RAG with LangChain

    @staticmethod
    def create_models_out_of_RAG(models_params : list[tuple], device): 
        list_models = []

        for tokenizer_name, model_path in models_params:

            model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
            model.eval()

            tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.padding_side = "right"

            list_models.append((tokenizer, model))
        
        return list_models
    
    def __call__(self, text_to_summarize):
        return self.predict_no_RAG(text_to_summarize)

    def predict_no_RAG(self, text_to_summarize):
        list_output = []

        for tokenizer, model in self.models_no_RAG:
            model.eval()
            model.to(self.device)

            prompt = f"""
            {text_to_summarize}
            """
                
            inputs = tokenizer(prompt, return_tensors='pt')
            inputs = {key: value.to(self.device) for key, value in inputs.items()}

            list_output_model = [] 
            for _ in range(self.n_brut_force): 
                with torch.no_grad(): 
                    output_ids = model.generate(**inputs)
                    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

                sim_score_output = compute_similarity_scores_text(text_to_summarize, output)

                if sim_score_output>0.85: 
                    return (sim_score_output, output)
                
                list_output_model.append((sim_score_output, output))

            list_output.append(max(list_output_model, key=lambda x: x[0]))

        return max(list_output, key=lambda x: x[0])
    
    def predict_with_RAG(self, text_to_summarize): 
        sim_score_output, output = self.predict_no_RAG(text_to_summarize)

        if sim_score_output > 0.85: 
            return output
        
        else: 
            ### Partie LLM + RAG : Enrichissement du résumé
            pass


# 7. Evaluation custom_class FT

In [None]:
from utils import import_data_from_json

airbus_datapath = os.path.join("./data/", "airbus_helicopters_train_set.json")
train_dataset, val_dataset, test_dataset = import_data_from_json(airbus_datapath)

text_summarizer = TextSummarizer(config_file_path="config.yaml")

In [None]:
from utils import evaluate

score_rouge, score_sim, perf_dict = evaluate(test_dataset, text_summarizer)

print(pd.DataFrame(perf_dict['Rouge']))
print(pd.DataFrame(perf_dict['Similarity']))
print(f"Score rouge is {score_rouge}")
print(f"Score sim is {score_sim}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


                        rouge1               rouge2               rougeL
precision  0.19112413139873094  0.11657675657675659  0.17368588695819817
recall      0.5599999999999999   0.3333333333333333                0.495
fmeasure      0.27005772005772  0.16266322136089578  0.24355459355459352
       similarity_with_reference_summary similarity_with_original_text
mean                          0.55679625                    0.65063244
median                         0.6943444                     0.6770458
std                           0.20050384                   0.066222034
Score rouge is 0.2254251783244031
Score sim is 0.58807498
