# Bibliothèques

In [118]:
from getpass import getpass
import os
from langchain_community.llms import HuggingFaceEndpoint
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

## Chemin vers le fichier de données

In [117]:
data_folder_path = 'data/'

## Token pour connexion avec Hugging Face

In [119]:
HUGGINGFACEHUB_API_TOKEN = getpass()
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

## Appel de l'API de Mistral-7B-Instruct-v0.2

In [124]:
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

llm = HuggingFaceEndpoint(
    repo_id=repo_id, max_length=128, temperature=0.3, token=HUGGINGFACEHUB_API_TOKEN
)

                    max_length was transferred to model_kwargs.
                    Please make sure that max_length is what you intended.
                    token was transferred to model_kwargs.
                    Please make sure that token is what you intended.


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/jskaf/.cache/huggingface/token
Login successful

Answer: The capital city of France is Paris. Paris is one of the most famous cities in the world and is known for its iconic landmarks such as the Eiffel Tower, Louvre Museum, Notre-Dame Cathedral, and the Arc de Triomphe. It is also home to numerous cafes, restaurants, and shops, making it a popular tourist destination. Paris has a rich history and is considered a cultural, artistic, and intellectual center of Europe.


## Exemple de prompt

In [123]:
question = "What is the Capital of France? "

template = """Question: {question}"""

prompt = PromptTemplate.from_template(template)

In [None]:
llm_chain = LLMChain(prompt=prompt, llm=llm)
print(llm_chain.run(question))

# Partie RAG

### Préparation des données au format .csv

In [128]:
import pandas as pd

df = pd.read_json(data_folder_path + "airbus_helicopters_train_set.json")
df = df.T
df

Unnamed: 0,original_text,reference_summary,uid
train_sum01,These general Standard Conditions of Sale appl...,These terms and conditions apply as soon as th...,train_sum01
train_sum010,Each Party represents to the other as at the d...,Each Party represents that the other is not a ...,train_sum010
train_sum0100,"All living, travelling and accommodation expen...","Expenses relating to the travelling, living an...",train_sum0100
train_sum0101,"Unless otherwise specified in the Contract, th...","Unless otherwise specified in the Contract, th...",train_sum0101
train_sum0102,Reasonable insurance coverage of risks arising...,Reasonable insurance coverage of risks arising...,train_sum0102
...,...,...,...
train_sum095,No term or provision hereof will be considered...,"No term, provision or breach shall be waived o...",train_sum095
train_sum096,Any variation or modification of the Contract ...,Any modification to the contract shall be put ...,train_sum096
train_sum097,The relationship between the Parties is solely...,No joint venture or partnership is intended no...,train_sum097
train_sum098,"The Customer shall not be entitled, without th...",Unless the Seller agrees to it through writing...,train_sum098


### Suppression des "anomalies" (résumé plus grand que le texte original)

In [129]:
indices_to_remove = ['train_sum0279', 'train_sum0317']

# Remove rows based on specified indices
df = df.drop(indices_to_remove, axis=0)
df

Unnamed: 0,original_text,reference_summary,uid
train_sum01,These general Standard Conditions of Sale appl...,These terms and conditions apply as soon as th...,train_sum01
train_sum010,Each Party represents to the other as at the d...,Each Party represents that the other is not a ...,train_sum010
train_sum0100,"All living, travelling and accommodation expen...","Expenses relating to the travelling, living an...",train_sum0100
train_sum0101,"Unless otherwise specified in the Contract, th...","Unless otherwise specified in the Contract, th...",train_sum0101
train_sum0102,Reasonable insurance coverage of risks arising...,Reasonable insurance coverage of risks arising...,train_sum0102
...,...,...,...
train_sum095,No term or provision hereof will be considered...,"No term, provision or breach shall be waived o...",train_sum095
train_sum096,Any variation or modification of the Contract ...,Any modification to the contract shall be put ...,train_sum096
train_sum097,The relationship between the Parties is solely...,No joint venture or partnership is intended no...,train_sum097
train_sum098,"The Customer shall not be entitled, without th...",Unless the Seller agrees to it through writing...,train_sum098


In [130]:
df_reset = df.reset_index()
df_reset = df_reset.drop(["index","uid"], axis = 1)
df_reset

Unnamed: 0,index,original_text,reference_summary,uid
0,train_sum01,These general Standard Conditions of Sale appl...,These terms and conditions apply as soon as th...,train_sum01
1,train_sum010,Each Party represents to the other as at the d...,Each Party represents that the other is not a ...,train_sum010
2,train_sum0100,"All living, travelling and accommodation expen...","Expenses relating to the travelling, living an...",train_sum0100
3,train_sum0101,"Unless otherwise specified in the Contract, th...","Unless otherwise specified in the Contract, th...",train_sum0101
4,train_sum0102,Reasonable insurance coverage of risks arising...,Reasonable insurance coverage of risks arising...,train_sum0102
...,...,...,...,...
406,train_sum095,No term or provision hereof will be considered...,"No term, provision or breach shall be waived o...",train_sum095
407,train_sum096,Any variation or modification of the Contract ...,Any modification to the contract shall be put ...,train_sum096
408,train_sum097,The relationship between the Parties is solely...,No joint venture or partnership is intended no...,train_sum097
409,train_sum098,"The Customer shall not be entitled, without th...",Unless the Seller agrees to it through writing...,train_sum098


### Sauvegarde des données au format .csv

In [133]:
df_reset.to_csv(data_folder_path + "airbus_helicopters_train_set.csv")

## Partie Langchain Agent

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import re

In [198]:
df_test = pd.read_csv(data_folder_path + "airbus_helicopters_train_set.csv")

In [None]:
embeddings = HuggingFaceEmbeddings()

### Création des documents surlesquels retrieve

In [135]:
def create_db_data() -> FAISS:
    global embeddings
    loader = CSVLoader(file_path=data_folder_path + "airbus_helicopters_train_set.csv")
    transcript = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = text_splitter.split_documents(transcript)
    db = FAISS.from_documents(docs, embeddings)
    return db


create_db_data()


<langchain_community.vectorstores.faiss.FAISS at 0x2de8e3550>

### Fonction qui résume à partir du texte original et du retrieval 

In [186]:
def get_summary_original_text(db, original_text,  k=1):
    global llm

    docs = db.similarity_search(original_text, k=k)
    docs_page_content = " ".join([d.page_content for d in docs])

    # Use regular expression to remove ": {number}"
    cleaned_text = re.sub(r': \d{1,3}', '', docs_page_content)
    # Remove leading and trailing whitespaces
    cleaned_text = cleaned_text.strip()

    print(cleaned_text)

    prompt = PromptTemplate(
        input_variables=["original_text", "docs"],
        template="""
        You are a helpful legal assistant that that can summarize legal text.
        
        Summarize the following original text : {original_text}, in a clear paragraph. You can use the following documents to help you write in the same tone: {docs}. Speak in a professional tone, as if you were writing a legal document.
        Do not mention the text. Be brief and shorter than the original text. 

        """,
    )
    
    chain = LLMChain(llm=llm, prompt=prompt)
    response = chain.run(original_text = original_text, docs = cleaned_text)
    response = response.replace("\n", "")
    return response, docs

### Fonction qui améliore un résumé proposé

In [193]:
def get_summary_enhanced(db, summary, original_text,  k=1):
    global llm

    docs = db.similarity_search(summary, k=k)
    docs_page_content = " ".join([d.page_content for d in docs])

    # Use regular expression to remove ": {number}"
    cleaned_text = re.sub(r': \d{1,3}', '', docs_page_content)
    # Remove leading and trailing whitespaces
    cleaned_text = cleaned_text.strip()

    print(cleaned_text)

    prompt = PromptTemplate(
        input_variables=["summary","original_text", "docs"],
        template="""
        You are a helpful legal assistant that that can summarize legal text.

        You have been asked to enhance the following summary: {summary}. The original text is: {original_text}. You can use the following documents to help you write in the same tone: {docs}. 
        Speak in a professional tone, as if you were writing a legal document.
        Do not mention the text. Be brief and shorter than the original text. 

        """,
    )
    
    chain = LLMChain(llm=llm, prompt=prompt)
    response = chain.run(summary = summary, original_text = original_text, docs = cleaned_text)
    response = response.replace("\n", "")
    return response, docs

#### ID du texte surlquel on veut améliorer/générer le résumé

In [187]:
num_text = 20

#### Exemple de summary

In [188]:
db = create_db_data()
original_text, summary = df_test["original_text"].iloc[num_text], df_test["reference_summary"].iloc[num_text]
summary

'If the Customer fails to fulfil these administrative conditions before the start of the training session, the Seller may postpone or cancel the Training Services at Customer’s expenses and apply provisions of articles E2-3 and/or Article 14.1.1 of the SCS.'

### Génération de résumé

In [189]:
response, docs = get_summary_original_text(db, original_text)

original_text: Should the Customer fail to fulfil the administrative conditions before the start of the training session, the Seller reserves the right to postpone said session at Customer expense or cancel it and apply the provisions defined under article E2-3 and/or article 14.1.1 of SCS.
reference_summary: If the Customer fails to fulfil these administrative conditions before the start of the training session, the Seller may postpone or cancel the Training Services at Customer’s expenses and apply provisions of articles E2-3 and/or Article 14.1.1 of the SCS.


In [190]:
response

" If the customer does not meet the administrative requirements prior to the training session, the seller has the right to either postpone the session at the customer's expense or cancel it altogether. In such cases, the provisions outlined in articles E2-3 and/or Article 14.1.1 of the SCS will apply."

### Amélioration de résumé

In [194]:
enhanced_summary, docs = get_summary_enhanced(db, response, original_text)

original_text: Should the Customer fail to fulfil the administrative conditions before the start of the training session, the Seller reserves the right to postpone said session at Customer expense or cancel it and apply the provisions defined under article E2-3 and/or article 14.1.1 of SCS.
reference_summary: If the Customer fails to fulfil these administrative conditions before the start of the training session, the Seller may postpone or cancel the Training Services at Customer’s expenses and apply provisions of articles E2-3 and/or Article 14.1.1 of the SCS.


In [195]:
enhanced_summary

'If the customer fails to meet administrative requirements before the training session, the seller may postpone or cancel at customer expense and apply provisions from articles E2-3 and 14.1.1.'