In [153]:
from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter, MarkdownTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain_community.document_loaders.markdown import UnstructuredMarkdownLoader
import os
load_dotenv()

True

In [154]:
# Checkpoint name handed to HuggingFaceEmbeddings as `model_name`.
model_path = "Lajavaness/sentence-camembert-large"

# Keyword arguments forwarded to the model and to the encode step.
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=False)

# Build the embedding model from the settings above.
embedding_model = HuggingFaceEmbeddings(
    model_name=model_path, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

In [155]:
def split_paragraphs(text, type_text, chunk_size=1500, chunk_overlap=0):
    """Split ``text`` into chunks with the splitter registered for ``type_text``.

    Args:
        text: The string to split.
        type_text: Splitter key. ``"txt"`` selects ``CharacterTextSplitter``,
            ``"md"`` selects ``MarkdownTextSplitter``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of the selected splitter's ``split_text(text)``.

    Raises:
        KeyError: If ``type_text`` is not one of the supported keys.
    """
    splitters = {
        "txt": CharacterTextSplitter,
        "md": MarkdownTextSplitter,
    }
    if type_text not in splitters:
        # Same exception type as a plain dict lookup, but with a usable message.
        raise KeyError(
            f"Unsupported text type {type_text!r}; expected one of {sorted(splitters)}"
        )
    text_splitter = splitters[type_text](
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

def load_file(file_path, file_type):
    """Load a plain-text or Markdown file and return its content as text chunks.

    Args:
        file_path: Path handed to the document loader.
        file_type: ``"txt"`` (loaded with ``TextLoader``) or ``"md"`` (loaded
            with ``UnstructuredMarkdownLoader``). The same value is forwarded
            to ``split_paragraphs`` to pick the matching splitter.

    Returns:
        list: The chunks of every loaded document, concatenated in load order.

    Raises:
        KeyError: If ``file_type`` is not one of the supported keys.
    """
    loaders = {
        "txt": TextLoader,
        "md": UnstructuredMarkdownLoader,
    }
    if file_type not in loaders:
        # Same exception type as a plain dict lookup, but with a usable message.
        raise KeyError(
            f"Unsupported file type {file_type!r}; expected one of {sorted(loaders)}"
        )
    # NOTE(review): no explicit encoding is passed to the loader; presumably the
    # platform default is used for "txt" files -- confirm for non-ASCII content.
    documents = loaders[file_type](file_path).load()
    text_chunks = []
    for doc in documents:
        text_chunks.extend(split_paragraphs(doc.page_content, file_type))
    return text_chunks

In [156]:
# Alternative input, kept for reference (plain-text variant of the source file):
# txt_path = "base_competence.txt"
# file_type = "txt"

# Source document and its type; `file_type` selects both loader and splitter.
txt_path = "base_competence.md"
file_type = "md"

base_chunks = load_file(txt_path, file_type)
if not base_chunks:
    # Fail here with a clear message instead of handing an empty list to FAISS.
    raise ValueError(f"No text chunks were produced from {txt_path!r}; nothing to index.")

# Embed the chunks and persist the index for the query cell below.
store = FAISS.from_texts(base_chunks, embedding_model)
store.save_local("chatbot/faiss_index")

In [None]:
# allow_dangerous_deserialization=True lets load_local unpickle the stored data.
# Only load index files you created yourself -- never an untrusted index.
db = FAISS.load_local("chatbot/faiss_index", embedding_model, allow_dangerous_deserialization=True)
query = "quels sont les compétences de ..."

# `k` is the number of results to return. The previous `fetch_k` argument is,
# per the LangChain FAISS docs, the candidate count fetched before filtering,
# so it did not express "return 4 results".
docs = db.similarity_search_with_score(query, k=4)

# Each result is a (document, score) pair; only the document is printed.
for doc, _score in docs:
    print(doc)