In [2]:
!pip install langchain langchain-community faiss-cpu sentence-transformers pandas

Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dote

In [3]:
import os
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
import pandas as pd
from langchain.docstore.document import Document

In [4]:
# Folder where create_vector_db() saves the FAISS index and where the main
# execution block looks for an already-built one.
vectordb_file_path = "faiss_index"

def create_vector_db(csv_path="/content/medquad_parsed_data.csv", chunk_size=500):
    """Build a FAISS index from a question/answer CSV and save it to disk.

    The CSV is read in chunks of ``chunk_size`` rows. Rows missing either
    a ``question`` or an ``answer`` are dropped; every remaining row becomes
    one ``Document`` whose text is the answer and whose ``source`` metadata
    is the question. The first non-empty chunk creates the index and later
    chunks are appended to it. The finished index is written to the folder
    named by the module-level ``vectordb_file_path``.

    Args:
        csv_path: Path of the CSV file; it must contain ``question`` and
            ``answer`` columns.
        chunk_size: Number of CSV rows read and embedded per batch.

    Returns:
        None. Progress is reported with ``print``; if no usable rows are
        found, nothing is saved.
    """
    print("Initializing a fast, local embedding model to run on your CPU...")
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'}
    )

    print(f"Loading data from {csv_path} in chunks of {chunk_size}...")

    vectordb = None

    # on_bad_lines='skip' drops malformed CSV rows instead of aborting the load.
    # Using the chunked reader as a context manager closes the underlying file
    # handle even if embedding a chunk raises.
    with pd.read_csv(
        csv_path, chunksize=chunk_size, encoding='utf-8', on_bad_lines='skip'
    ) as reader:
        for chunk_number, chunk in enumerate(reader, start=1):
            print(f"--- Processing chunk {chunk_number} ---")
            usable_rows = chunk.dropna(subset=['question', 'answer'])

            # Iterate the two columns directly (no per-row Series from iterrows)
            # and coerce the answer to str so a non-text cell cannot fail
            # Document validation.
            documents = [
                Document(page_content=str(answer), metadata={'source': question})
                for question, answer in zip(usable_rows['question'], usable_rows['answer'])
            ]

            if not documents:
                print(f"Chunk {chunk_number} is empty after cleaning. Skipping.")
                continue

            if vectordb is None:
                print(f"Creating initial vector database from chunk {chunk_number}...")
                vectordb = FAISS.from_documents(documents=documents, embedding=embeddings)
            else:
                print(f"Adding chunk {chunk_number} to the existing vector database...")
                vectordb.add_documents(documents=documents)

    # Explicit None check: "was an index ever created" should not depend on
    # how the vector store object defines truthiness.
    if vectordb is not None:
        print(f"\nSaving final vector database to {vectordb_file_path}...")
        vectordb.save_local(vectordb_file_path)
        print("Vector database created and saved successfully.")
    else:
        print("No data was processed. Vector database not created.")

# Main execution block
# Rebuild only when the saved index files are missing. Checking for the files
# themselves, not just the folder, means an empty or half-written folder
# (for example after an interrupted run) does not block a rebuild.
index_files = [
    os.path.join(vectordb_file_path, file_name)
    for file_name in ("index.faiss", "index.pkl")
]
if all(os.path.isfile(path) for path in index_files):
    print(f"Vector database already exists. If you want to recreate it, delete the '{vectordb_file_path}' folder.")
else:
    create_vector_db()


Initializing a fast, local embedding model to run on your CPU...


  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading data from /content/medquad_parsed_data.csv in chunks of 500...
--- Processing chunk 1 ---
Chunk 1 is empty after cleaning. Skipping.
--- Processing chunk 2 ---
Chunk 2 is empty after cleaning. Skipping.
--- Processing chunk 3 ---
Chunk 3 is empty after cleaning. Skipping.
--- Processing chunk 4 ---
Chunk 4 is empty after cleaning. Skipping.
--- Processing chunk 5 ---
Chunk 5 is empty after cleaning. Skipping.
--- Processing chunk 6 ---
Chunk 6 is empty after cleaning. Skipping.
--- Processing chunk 7 ---
Chunk 7 is empty after cleaning. Skipping.
--- Processing chunk 8 ---
Chunk 8 is empty after cleaning. Skipping.
--- Processing chunk 9 ---
Chunk 9 is empty after cleaning. Skipping.
--- Processing chunk 10 ---
Chunk 10 is empty after cleaning. Skipping.
--- Processing chunk 11 ---
Chunk 11 is empty after cleaning. Skipping.
--- Processing chunk 12 ---
Chunk 12 is empty after cleaning. Skipping.
--- Processing chunk 13 ---
Chunk 13 is empty after cleaning. Skipping.
--- Process

  return forward_call(*args, **kwargs)


--- Processing chunk 64 ---
Adding chunk 64 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 65 ---
Adding chunk 65 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 66 ---
Adding chunk 66 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 67 ---
Adding chunk 67 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 68 ---
Adding chunk 68 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 69 ---
Adding chunk 69 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 70 ---
Adding chunk 70 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 71 ---
Adding chunk 71 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 72 ---
Adding chunk 72 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 73 ---
Adding chunk 73 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 74 ---
Adding chunk 74 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 75 ---
Adding chunk 75 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 76 ---
Adding chunk 76 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 77 ---
Adding chunk 77 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 78 ---
Adding chunk 78 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 79 ---
Adding chunk 79 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 80 ---
Adding chunk 80 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 81 ---
Adding chunk 81 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 82 ---
Adding chunk 82 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 83 ---
Adding chunk 83 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 84 ---
Adding chunk 84 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 85 ---
Adding chunk 85 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 86 ---
Adding chunk 86 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 87 ---
Adding chunk 87 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 88 ---
Adding chunk 88 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 89 ---
Adding chunk 89 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 90 ---
Adding chunk 90 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 91 ---
Adding chunk 91 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 92 ---
Adding chunk 92 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 93 ---
Adding chunk 93 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 94 ---
Adding chunk 94 to the existing vector database...


  return forward_call(*args, **kwargs)


--- Processing chunk 95 ---
Adding chunk 95 to the existing vector database...


  return forward_call(*args, **kwargs)



Saving final vector database to faiss_index...
Vector database created and saved successfully.


In [5]:
# Recursively archive the saved `faiss_index` folder into faiss_index.zip.
# Run this cell only after the index-building cell has finished saving.
!zip -r faiss_index.zip faiss_index

  adding: faiss_index/ (stored 0%)
  adding: faiss_index/index.pkl (deflated 76%)
  adding: faiss_index/index.faiss (deflated 10%)
