In [20]:
import os
import torch
import numpy as np

from trl import SFTTrainer
from datasets import Dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,TrainingArguments

# 1. Import config

In [25]:
import yaml

with open("config.yaml", 'r') as stream:
    try:
        config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)

# 2. Import data

In [22]:
from utils import import_data_from_json

device = torch.device('mps')

airbus_datapath = os.path.join("./data/", "airbus_helicopters_train_set.json")
train_dataset, val_dataset, test_dataset = import_data_from_json(airbus_datapath)

model_name = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

#setting padding instructions for tokenizer
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

# 3. Fine tuning 

In [26]:
# Create the trainer
from utils import prompt_instruction_format

trainingArgs = TrainingArguments(**config['parameters_ft'])

peft_config = LoraConfig(**config['parameters_LoRA'])

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset = val_dataset,
    peft_config=peft_config,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=prompt_instruction_format,
    args=trainingArgs,
    max_seq_length=512
)

trainer.train()

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

  0%|          | 0/70 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 17.90 GB, other allocations: 157.53 MB, max allowed: 18.13 GB). Tried to allocate 192.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

# 4. Test et similarité sBERT

In [None]:
def predict_from_model(model, legal_text, device): 

    model.eval()
    model.to(device)
    
    prompt = f"""
    Summarize the following legal text.

    {legal_text}

    Summary:
    """

    inputs = tokenizer(prompt, return_tensors='pt')
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad(): 
        output_ids = model.generate(**inputs)
        output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    return output, prompt

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

index = 1

legal_text = test_dataset['original_text'][index]
summary = test_dataset['reference_summary'][index]

output, prompt = predict_from_model(model, legal_text, device)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

    Summarize the following legal text.

    Technical Data of the Customer’s Helicopter shall be provided by the Seller in English.

    Summary:
    
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Technical Data are provided in English.

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
The Customer’s Helicopter shall be provided with technical data of the Customer’s He


In [None]:
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util

device = torch.device('mps')
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

def compute_rouge(test_dataset : Dataset, model) -> dict[str, dict[str, str]]:
    scores = []
    metrics = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(
        metrics,
        use_stemmer=True
    )
        
    for i in range(len(test_dataset)):
        example = test_dataset[i]  

        original_text = example['original_text']
        reference_summary = example['reference_summary']
        generated_summary, _ = predict_from_model(model, original_text, device)

        scores.append(
              scorer.score(
                    generated_summary,
                    reference_summary
              )
        )

    final_scores = {
        metric: {
            "precision": str(np.mean(
                [score.get(metric).precision for score in scores]
            )),
            "recall": str(np.mean(
                [score.get(metric).recall for score in scores]
            )),
            "fmeasure": str(np.mean(
                [score.get(metric).fmeasure for score in scores]
            )),
            }
        for metric in metrics
    }
    return final_scores

def compute_similarity_scores(test_dataset : Dataset, model) -> dict[str, dict[str, str]]:
    scores_with_reference = []
    scores_with_original_text = []

    for i in range(len(test_dataset)):
        example = test_dataset[i]  

        original_text = example['original_text']
        reference_summary = example['reference_summary']
        generated_summary, _ = predict_from_model(model, original_text, device)

        reference_embeddings = semantic_model.encode(
            reference_summary,
            convert_to_tensor=True
        )
        original_text_embeddings = semantic_model.encode(
            original_text,
            convert_to_tensor=True
        )
        generated_embeddings = semantic_model.encode(
            generated_summary,
            convert_to_tensor=True
        )

        scores_with_reference.append(
            util.cos_sim(
                reference_embeddings,
                generated_embeddings).cpu()
        )
        scores_with_original_text.append(
            util.cos_sim(
                original_text_embeddings,
                generated_embeddings).cpu()
        )
            
    final_scores = {
        "similarity_with_reference_summary": {
            "mean":   str(np.mean(scores_with_reference)),
            "median": str(np.median(scores_with_reference)),
            "std":    str(np.std(scores_with_reference))
        },
        "similarity_with_original_text": {
            "mean":   str(np.mean(scores_with_original_text)),
            "median": str(np.median(scores_with_original_text)),
            "std":    str(np.std(scores_with_original_text))
        }
    }
    return final_scores


def evaluate(test_dataset : Dataset, model):
    model.eval()
    return {
        "Rouge": compute_rouge(test_dataset, model),
        "Similarity": compute_similarity_scores(test_dataset, model)
    }

performance_metrics = evaluate(test_dataset, model)
print(performance_metrics)



{'Rouge': {'rouge1': {'precision': '0.33373947998609704', 'recall': '0.5656671798478522', 'fmeasure': '0.37287924055321575'}, 'rouge2': {'precision': '0.19779041111828605', 'recall': '0.32981260880420543', 'fmeasure': '0.2171983821513787'}, 'rougeL': {'precision': '0.28986354281565446', 'recall': '0.4957283496149042', 'fmeasure': '0.32417701327042575'}}, 'Similarity': {'similarity_with_reference_summary': {'mean': '0.62909263', 'median': '0.69479036', 'std': '0.25737685'}, 'similarity_with_original_text': {'mean': '0.6864568', 'median': '0.7865524', 'std': '0.24224243'}}}


In [None]:
def final_score(dict_performance : dict):
    score = 0

    performance_rouge = dict_performance['Rouge']
    performance_similarity = dict_performance['Similarity']

    for metric in performance_rouge.values():
        score += float(metric['precision'])

    for metric in performance_similarity.values(): 
        score += float(metric['mean'])

    return score / (len(performance_rouge)+len(performance_similarity))

score = final_score(performance_metrics)
print(score)

0.42738857278400744
