In [1]:
from langchain_community.document_loaders import UnstructuredExcelLoader, UnstructuredMarkdownLoader
from langchain_huggingface import HuggingFaceEmbeddings
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from uuid import uuid4
from langchain_core.documents import Document
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd


# Sentence-transformers checkpoint handed to HuggingFaceEmbeddings below.
embed_model_name = "paraphrase-multilingual-MiniLM-L12-v2"

# Chunking parameters for RecursiveCharacterTextSplitter.
chunk_size = 500
chunk_overlap = 50

# Source files, resolved relative to `data_folder` when loading.
# NOTE: "accomadation.xlsx" is spelled this way on purpose -- it is used as a
# literal path, so it has to match the name of the file on disk.
data_folder = Path("data")
files = [
    "accomadation.xlsx",
    "courses.xlsx",
    "extras.xlsx",
    "faq.xlsx",
    "fees_and_scholarships.md",
    "how_to_register.xlsx",
    "registration_procedures.xlsx",
    "sports_club.xlsx",
    "student_club.xlsx",
]


In [2]:
# Load every configured file into one flat list of LangChain documents.
# Loading is best-effort: an unsupported extension or a failing loader is
# reported and skipped so the remaining files are still processed.
docs = []

for file in files:
    file_path = data_folder / file

    # The loader is chosen purely from the file extension.
    if file.endswith('.md'):
        loader = UnstructuredMarkdownLoader(str(file_path))
    elif file.endswith('.xlsx'):
        # mode="elements" yields one document per parsed element of the sheet.
        loader = UnstructuredExcelLoader(str(file_path), mode="elements")
    else:
        print(f"Unsupported file type: {file}")
        continue

    try:
        docs.extend(loader.load())
    except Exception as e:
        print(f"Error loading {file}: {e}")

print(f"Loaded {len(docs)} documents.")

Loaded 3077 documents.


In [3]:
# Embedding model shared by the indexing and retrieval steps.
embeddings = HuggingFaceEmbeddings(model_name=embed_model_name)

# Collect the raw text of each loaded element.
_doc_texts = [
    element.page_content
    for element in docs
    if hasattr(element, 'page_content')
]

# Drop elements whose whole text is exactly "Context"
# (presumably a spreadsheet header cell rather than content -- verify against the data).
doc_texts = [content for content in _doc_texts if content != "Context"]

# Split each text into overlapping chunks and flatten them into a single list,
# preserving document order.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
chunked_texts = [
    piece
    for content in doc_texts
    for piece in splitter.split_text(content)
]

# One vector per chunk, in the same order as `chunked_texts`.
doc_embeddings = embeddings.embed_documents(chunked_texts)

print(f"Generated {len(doc_embeddings)} embeddings.")


  from tqdm.autonotebook import tqdm, trange
2024-12-14 21:47:25.164852: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-14 21:47:25.173140: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734202045.184354  312453 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734202045.187652  312453 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-14 21:47:25.199193: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow b

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.12k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generated 8570 embeddings.


In [10]:
# Preview only the first few loaded elements; echoing the whole `docs` list
# would dump every loaded document into the notebook output.
docs[:5]

[Document(metadata={'source': 'data/accomadation.xlsx', 'file_directory': 'data', 'filename': 'accomadation.xlsx', 'last_modified': '2024-11-30T11:22:53', 'page_name': 'Sheet1', 'page_number': 1, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'category': 'Title', 'element_id': 'ac1c7fd80112323025faa39859c8efdd'}, page_content='Context'),
 Document(metadata={'source': 'data/accomadation.xlsx', 'file_directory': 'data', 'filename': 'accomadation.xlsx', 'last_modified': '2024-11-30T11:22:53', 'page_name': 'Sheet1', 'page_number': 1, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'parent_id': 'ac1c7fd80112323025faa39859c8efdd', 'category': 'NarrativeText', 'element_id': '962af5b8d46e13e3a252f754ef846b80'}, page_content="General Information: There are 19 dormitory buildings run by the METU Directorate of Health, Culture and Sports.\xa0\nMale students can stay in the Dormitories 2,

In [4]:
# Build the FAISS vector store from the chunks embedded in the previous step.
if not doc_embeddings:
    # Fail with a clear message instead of an IndexError on doc_embeddings[0].
    raise ValueError("No embeddings were generated; cannot build the FAISS index.")

# Flat L2 index sized to the embedding dimension.
index = faiss.IndexFlatL2(len(doc_embeddings[0]))

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

# Kept for reporting; note that the chunks carry no metadata.
documents = [Document(page_content=text) for text in chunked_texts]
uuids = [str(uuid4()) for _ in range(len(documents))]

# Reuse the vectors already computed in `doc_embeddings` (one per entry of
# `chunked_texts`, in the same order). Calling add_documents here would run
# the embedding model over every chunk a second time.
vector_store.add_embeddings(
    text_embeddings=zip(chunked_texts, doc_embeddings),
    ids=uuids,
)

print(f"FAISS vector store created with {len(documents)} documents.")


FAISS vector store created with 8570 documents.


In [5]:
# Persist the index to a folder named after the embedding model, so an index
# built with one model is not confused with one built with another.
faiss_local_dir = f"faiss_index_{embed_model_name}"
vector_store.save_local(folder_path=faiss_local_dir)

In [6]:
# Reload the index that was just written to `faiss_local_dir`.
# SECURITY: allow_dangerous_deserialization=True opts in to unpickling the saved
# docstore; only use it on index folders produced by this notebook, never on
# files from an untrusted source.
new_vector_store = FAISS.load_local(
    folder_path=faiss_local_dir,
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

In [7]:
# MMR retriever returning the top 5 chunks.
retriever = new_vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 5})

# No metadata filter is passed: the chunks were indexed with empty metadata, so a
# filter such as {"source": "news"} can never match a stored document. It would
# either be silently ignored or exclude every result. To filter, first store
# metadata on the indexed documents and then pass the filter through
# `search_kwargs` in `as_retriever`.
retriever.invoke("wifi ye nasıl bağlanabilirim?")

[Document(metadata={}, page_content="FAQ Question: It seems I have a connection to the wireless network but I can't connect to the Internet. What should I do?"),
 Document(metadata={}, page_content='and configure the file as follows (use your own username and password) network={  ssid="eduroam" key_mgmt=WPA-EAP pairwise=AES group=AES eap=TTLS phase2="auth=PAP" anonymous_identity="anonymousmetu.edu.tr" identity="user_namemetu.edu.tr" password="your_password" }  Connect to the wireless network by entering the command edited according to your network adapter: wpa_supplicant -B -i eth2 -c /etc/wpa_supplicant/wpa_supplicant.conf -D wext (for use of wpa_supplicant refer to man wpa_supplicant and'),
 Document(metadata={}, page_content='FAQ Question: What is security level of eduroam?\nFAQ Answer:   Today, the well known security technologies in wireless neotworking are WEP, WPA, WPA2. The security level of WEP and WPA are today very low level. For that reason our setup files come predefined w