In [1]:
!pip install pandas langchain langchain-community sentence-transformers faiss-cpu "transformers[agents]"

Collecting pandas
  Using cached pandas-2.2.2-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting langchain
  Downloading langchain-0.2.9-py3-none-any.whl.metadata (6.9 kB)
Collecting langchain-community
  Using cached langchain_community-0.2.7-py3-none-any.whl.metadata (2.5 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp312-cp312-win_amd64.whl.metadata (3.8 kB)
Collecting transformers[agents]
  Using cached transformers-4.42.4-py3-none-any.whl.metadata (43 kB)
Collecting numpy>=1.26.0 (from pandas)
  Using cached numpy-2.0.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting PyYAML>=5.3 (from langchain)
  Using cached PyYAML-6.0.1-cp312-cp312-win_amd

In [2]:
pip install "git+https://github.com/huggingface/transformers.git#egg=transformers[agents]"

Collecting transformers (from transformers[agents])
  Cloning https://github.com/huggingface/transformers.git to c:\users\anoop maurya\appdata\local\temp\pip-install-p5ltffma\transformers_1b345546c487435cbe4ce3ded1b7fe75
  Resolved https://github.com/huggingface/transformers.git to commit 271fd8e60d26b1773321c8b01fb67ed831fd3494
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml): started
  Building wheel for transformers (pyproject.toml): finished with status 'error'
Failed to build transformers
Note: you may need to restart the kernel to use updated packages.


DEPRECATION: git+https://github.com/huggingface/transformers.git#egg=transformers[agents] contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/11617
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git 'C:\Users\Anoop Maurya\AppData\Local\Temp\pip-install-p5ltffma\transformers_1b345546c487435cbe4ce3ded1b7fe75'
  error: subprocess-exited-with-error
  
  × Building wheel for transformers (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [3049 lines of output]
      running bdist_wheel
      running build
      running build_py
      creating build
      creating build\lib
      creating build\lib\transformers
      copying src\transformers\activations.py -> build\lib\transformers
      copying src\transformers\activations_tf.py -> build\li

In [3]:
import pandas as pd
import datasets
from transformers import AutoTokenizer
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from tqdm import tqdm
from transformers.agents import Tool, HfEngine, ReactJsonAgent
from huggingface_hub import InferenceClient
import logging

# Set up logging
# Configure the root logger at INFO level; this also surfaces INFO records
# emitted by imported libraries, not just this notebook's own messages.
logging.basicConfig(level=logging.INFO)
# Notebook-level logger; inside a notebook __name__ is "__main__", which is
# the prefix shown on the log lines below.
logger = logging.getLogger(__name__)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Fetch the "train" split of the m-ric/huggingface_doc dataset from the
# Hugging Face Hub; it serves as the knowledge base for the rest of the notebook.
kb = datasets.load_dataset(path="m-ric/huggingface_doc", split="train")

Downloading readme: 100%|██████████| 21.0/21.0 [00:00<?, ?B/s]
Downloading data: 100%|██████████| 22.0M/22.0M [00:01<00:00, 19.7MB/s]
Generating train split: 100%|██████████| 2647/2647 [00:00<00:00, 4100.34 examples/s]


In [5]:
# Bare expression: the notebook renders the dataset summary (feature names and row count).
kb

Dataset({
    features: ['text', 'source'],
    num_rows: 2647
})

In [6]:
# Wrap each knowledge-base row in a LangChain Document.
# The row's "text" becomes the page content; the metadata keeps only the second
# "/"-separated component of the row's "source" field
# (presumably the repository name -- TODO confirm against the dataset).
source_docs = [
    Document(
        page_content=record["text"],
        metadata=dict(source=record["source"].split("/")[1]),
    )
    for record in kb
]

logger.info(f"Loaded {len(source_docs)} documents from the knowledge base")

INFO:__main__:Loaded 2647 documents from the knowledge base


In [7]:
# Build the chunker.
# The splitter is created from the gte-small tokenizer via
# `from_huggingface_tokenizer`, so chunk_size / chunk_overlap are expected to be
# measured with that tokenizer rather than in raw characters.
tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small")
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer=tokenizer,
    # Separators are tried in order, from coarse (paragraph) to fine (character).
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=200,
    chunk_overlap=20,
    strip_whitespace=True,
    # Record each chunk's start offset in its metadata.
    add_start_index=True,
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [8]:
# Split documents and remove duplicates
# Each source document is chunked on its own (so tqdm reports per-document
# progress); a chunk is kept only the first time its exact text is seen.
logger.info("Splitting documents...")
docs_processed = []
unique_texts = set()  # chunk texts already emitted; a set is the idiomatic membership container
for doc in tqdm(source_docs):
    for new_doc in text_splitter.split_documents([doc]):
        if new_doc.page_content in unique_texts:
            continue  # exact duplicate of an earlier chunk -- skip it
        unique_texts.add(new_doc.page_content)
        docs_processed.append(new_doc)

logger.info(f"Processed {len(docs_processed)} unique document chunks")

INFO:__main__:Splitting documents...
100%|██████████| 2647/2647 [01:55<00:00, 22.95it/s]
INFO:__main__:Processed 43181 unique document chunks


In [9]:
# Load the sentence-embedding model (same checkpoint name as the tokenizer used for splitting).
logger.info("Initializing embedding model...")
embedding_model = HuggingFaceEmbeddings(model_name="thenlper/gte-small")

# Embed every deduplicated chunk and index the vectors in FAISS,
# requesting the cosine distance strategy.
logger.info("Creating vector database...")
vectordb = FAISS.from_documents(
    docs_processed,
    embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

logger.info("Vector database created successfully")

INFO:__main__:Initializing embedding model...
  warn_deprecated(
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: thenlper/gte-small
INFO:__main__:Creating vector database...


In [None]:
# Persist the FAISS index to the "local" directory (relative to the working dir).
# Log success only after save_local has returned, so a failed save is never
# reported as successful.
vectordb.save_local("local")
logger.info("Saved Vector database successfully")