In [109]:
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
import os
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain.schema import HumanMessage
load_dotenv()

True

In [110]:
llm = ChatOpenAI(api_key= os.getenv("OPENAI_API_KEY"), model="gpt-3.5-turbo")
model_path = "./sentence-camembert-large"

# Model and encoding configurations
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}

# Initialize the embedding model
embedding_model = HuggingFaceEmbeddings(
    model_name=model_path,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [111]:
# Function to split text into chunks
def split_paragraphs(rawText):
    text_splitter = CharacterTextSplitter(
        chunk_size=200,
        chunk_overlap=50
    )
    return text_splitter.split_text(rawText)

# Function to load PDFs and split text into chunks
def load_txt_file(txt_path):
    text_chunks = []    
    loader = TextLoader(txt_path)
    documents = loader.load()
    for doc in documents:
        chunks = split_paragraphs(doc.page_content)
        text_chunks.extend(chunks)
    return text_chunks

In [112]:
txt_path = "base_competence.txt"
base_chunks = load_txt_file(txt_path)

Created a chunk of size 447, which is longer than the specified 200
Created a chunk of size 332, which is longer than the specified 200
Created a chunk of size 570, which is longer than the specified 200


Created a chunk of size 241, which is longer than the specified 200
Created a chunk of size 501, which is longer than the specified 200
Created a chunk of size 752, which is longer than the specified 200
Created a chunk of size 384, which is longer than the specified 200
Created a chunk of size 1648, which is longer than the specified 200
Created a chunk of size 281, which is longer than the specified 200
Created a chunk of size 509, which is longer than the specified 200
Created a chunk of size 233, which is longer than the specified 200
Created a chunk of size 1310, which is longer than the specified 200
Created a chunk of size 436, which is longer than the specified 200
Created a chunk of size 655, which is longer than the specified 200


In [113]:
store = FAISS.from_texts(base_chunks, embedding_model)
store.save_local("faiss_index")

In [114]:
db = FAISS.load_local("faiss_index", embedding_model, allow_dangerous_deserialization=True)
query = "INRAE"
k = 4
docs = db.similarity_search_with_score(query, fetch_k=k)
sorted_docs = sorted(docs, key=lambda x: x[1], reverse=True)
for i in range(k):
    print(sorted_docs[i])

(Document(page_content="Compétences\nTECHNIQUES : NLP, RAG, LLM,  analyse de données, reporting,, deep learning, machine learning, classification d'images, plan de test\nDOMAINES FONCTIONNELS : système d'information, transport, science et recherche, formation professionnelle\nMÉTHODOLOGIE DE GESTION DE PROJET : Agile Scrum\nOUTILS : Power BI, Alteryx, Elasticsearch, Git, Microsoft Azure DevOps, Postman, SoapUI\nLANGAGES DE DÉVELOPPEMENT : Python, SQL, Node.js\nBASES DE DONNÉES : MySQL, PostgreSQL, Redis\nPLATEFORMES : Google Cloud Platform, Microsoft Azure Databricks\nLANGUES : Anglais (Courant)"), 122.55694)
(Document(page_content='Caisse des dépôts et de Consignation • Data Engineer • depuis 03/2024 (3 mois)\nClub Data mc2i • Data Analyst • depuis 10/2022 (1 an et 8 mois)\nIle-De-France Mobilités • Analytics Engineer • 11/2022 à 03/2024 (1 an et 5 mois)\nPwC France • Data Scientist • 09/2021 à 08/2022 (1 an)\nINRAE • Data Scientist • 04/2021 à 08/2021 (5 mois)'), 116.35309)
(Document

In [115]:
def custom_prompt(retriever, query):
    results = retriever.similarity_search(query, k=3)
    source_knowledge = " ".join([x.page_content for x in results])
    augment_prompt = f"""Tu es un assistant et tu dois repondre aux questions qui te sont posées, en utilisant le contexte suivant: \n\nContexte :\n{source_knowledge}\n\nQuestion : \n{query}"""
    return augment_prompt

In [120]:

query = "cites mois les langages que HTI utilise"
prompt = [
    HumanMessage(content=custom_prompt(db, query))]
res = llm.invoke(prompt)
print(res.content)

HTI utilise principalement les langages de programmation tels que Python, SQL et R dans le cadre de son travail en data analytics, genAI et l'automatisation des tests fonctionnels. Il a également une bonne maîtrise en traitement de langage naturel (NLP) pour la mise en place de chatbots basés sur la technologie RAG et LLM.
