In [3]:
from langchain_community.document_loaders import HuggingFaceDatasetLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForCausalLM,pipeline
from langchain_huggingface import HuggingFacePipeline
# RetrievalQA is deprecated in newer LangChain versions
# Use create_retrieval_chain or LCEL pattern instead
# from langchain.chains import RetrievalQA  # Deprecated

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

# Read the variables defined in a local .env file into the process environment
load_dotenv()

# Take the first non-empty value among the supported token variable names,
# checked in priority order; fall back to None when none of them is set.
hf_token = next(
    (
        candidate
        for candidate in map(
            os.getenv, ("HF_TOKEN", "HUGGINGFACE_TOKEN", "HUGGINGFACE_HUB_TOKEN")
        )
        if candidate
    ),
    None,
)

# Fail fast with a clear message instead of attempting an anonymous login
if hf_token is None:
    raise ValueError("HF_TOKEN, HUGGINGFACE_TOKEN, or HUGGINGFACE_HUB_TOKEN not found in .env file")

# Authenticate this session against the Hugging Face Hub
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [5]:
# Hugging Face Hub identifier of the dataset to read
dataset_name = "ruslanmv/ai-medical-chatbot"

# Build a loader that uses the "Doctor" column as each document's page content
loader_doctor = HuggingFaceDatasetLoader(
    path=dataset_name,
    page_content_column="Doctor",
)

In [6]:
# Load only the first 1000 documents.
# `lazy_load()` yields documents one at a time, so `islice` stops after the
# 1000th instead of building a Document for every row of the dataset
# (256,916 rows in the train split) and then discarding almost all of them.
from itertools import islice

doctor_data = list(islice(loader_doctor.lazy_load(), 1000))

# Preview the first two documents
doctor_data[:2]

Generating train split: 100%|██████████| 256916/256916 [00:00<00:00, 1591777.18 examples/s]


[Document(metadata={'Description': 'Q. What does abutment of the nerve root mean?', 'Patient': 'Hi doctor,I am just wondering what is abutting and abutment of the nerve root means in a back issue. Please explain. What treatment is required for\xa0annular bulging and tear?'}, page_content='"Hi. I have gone through your query with diligence and would like you to know that I am here to help you. For further information consult a neurologist online -->"'),
 Document(metadata={'Description': 'Q. What should I do to reduce my weight gained due to genetic hypothyroidism?', 'Patient': 'Hi doctor, I am a 22-year-old female who was diagnosed with hypothyroidism (genetic) when I was 12. Over the past five years, I have become around 50 pounds overweight and all of my attempts to lose have seemed to fail so I have given up, but my weight has stayed the same. There is so much information put there about losing weight with hypothyroidism but it all seems to conflict. I am so unsure as to what type o

In [9]:
# Sentence-transformers checkpoint used to embed the documents and queries
modelPath = "sentence-transformers/all-MiniLM-L12-v2"

# Model options: run the embedding model on the CPU
model_kwargs = dict(device="cpu")

# Encoding options: leave the embedding vectors un-normalized
encode_kwargs = dict(normalize_embeddings=False)

# Build the embedding wrapper from the settings above
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [10]:
# Sanity check: embed one sample question with the configured model
text = "Why are you a doctor?"
query_result = embeddings.embed_query(text=text)

# Show only the first three components of the embedding vector
query_result[:3]

[-0.05935199186205864, 0.08008933812379837, 0.040729619562625885]

In [12]:
# Embed the loaded documents and index them in a FAISS vector store.
# NOTE: this needs the `faiss` Python package (`faiss-cpu` or `faiss-gpu`)
# installed in the kernel's environment; without it the call raises ImportError.
vector_db = FAISS.from_documents(documents=doctor_data, embedding=embeddings)

# Persist the index to disk so it can be reloaded without re-embedding
vector_db.save_local(folder_path="db/faiss_doctor_index")

ImportError: Could not import faiss python package. Please install it with `pip install faiss-gpu` (for CUDA supported GPU) or `pip install faiss-cpu` (depending on Python version).