# FAISS Vector Store using LangChain


In [19]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# Sample biography used as the corpus for this notebook: the string is split
# into chunks further down and those chunks are what get indexed and searched.
text = """
My name is Hasnain Yaqub. I am a beginner data scientist and GenAI learner from Pakistan.

I am currently learning Data Science and Artificial Intelligence through Codanics.com's 6 Months Data Science and AI Mentorship Program. My teacher and the founder of Codanics is Dr. The program covers Statistics, Mathematics, Python, Pandas, NumPy, Scikit-learn, PyTorch, SQL, Data Visualization, Machine Learning, Deep Learning, Generative AI, NLP, Prompt Engineering, Time Series Analysis, Streamlit, Flask, FastAPI, Tableau, and Power BI.

My core strength is data analysis. I am skilled in Exploratory Data Analysis, data preprocessing, data cleaning, data wrangling, feature understanding, and visualization using Python libraries.

I have completed multiple EDA projects including analysis of Google Play Store data, car dataset analysis with preprocessing and column correction, and e-commerce data analysis. These projects are showcased on my portfolio website.

I am actively learning Generative AI and LangChain. I have worked with prompts, output parsers, structured output, runnables, chains, and retrievers. I have built RAG based chatbots including a university information chatbot and a research paper explainer app using Streamlit.

I have experience using Hugging Face models, OpenRouter models, Groq LLMs, and transformer based embeddings. I understand text embeddings, semantic search, vector stores, and retrieval pipelines.

I use Python as my primary programming language. I work in Linux environments and manage virtual environments using Conda. I have experience resolving dependency issues related to NumPy, SciPy, Gensim, and transformer libraries.

I have basic backend knowledge using FastAPI. I understand HTTP methods, path and query parameters, Pydantic models, POST, PUT, and DELETE requests. I have worked on backend features such as user reviews storage using SQLite and API driven dashboards.

I am building my freelancing profiles on Fiverr and Upwork. I offer services related to data analysis, machine learning, deep learning, NLP, and AI based solutions. I also provide custom AI services.

My long term goal is to grow as an AI and ML engineer with a strong focus on Generative AI, RAG systems, and applied AI products.
"""

# Chunking setup: pieces of at most 100 characters, with 15 characters of
# overlap carried between neighbouring pieces.
spliter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=15)

# Split the raw text and wrap every resulting string in a Document in one pass;
# `docs` is the list handed to the vector store later in the notebook.
docs = [Document(page_content=piece) for piece in spliter.split_text(text)]
print(docs)

[Document(metadata={}, page_content='My name is Hasnain Yaqub. I am a beginner data scientist and GenAI learner from Pakistan.'), Document(metadata={}, page_content="I am currently learning Data Science and Artificial Intelligence through Codanics.com's 6 Months"), Document(metadata={}, page_content='6 Months Data Science and AI Mentorship Program. My teacher and the founder of Codanics is Dr. The'), Document(metadata={}, page_content='is Dr. The program covers Statistics, Mathematics, Python, Pandas, NumPy, Scikit-learn, PyTorch,'), Document(metadata={}, page_content='PyTorch, SQL, Data Visualization, Machine Learning, Deep Learning, Generative AI, NLP, Prompt'), Document(metadata={}, page_content='NLP, Prompt Engineering, Time Series Analysis, Streamlit, Flask, FastAPI, Tableau, and Power BI.'), Document(metadata={}, page_content='My core strength is data analysis. I am skilled in Exploratory Data Analysis, data preprocessing,'), Document(metadata={}, page_content='preprocessing, dat

In [4]:
# Options forwarded to the underlying model when it is loaded.
model_options = {
    # Inference runs on the CPU here; presumably "cuda" would select a GPU
    # when one is available -- TODO confirm against the embeddings docs.
    "device": "cpu",
    # NOTE(review): this allows code shipped with the model repository to be
    # executed locally -- only keep it enabled for sources you trust.
    "trust_remote_code": True,
}

# Options forwarded to each encode call; requests normalized embedding vectors.
encode_options = {"normalize_embeddings": True}

emb = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1",
    model_kwargs=model_options,
    encode_kwargs=encode_options,
)


<All keys matched successfully>


In [14]:
# Build a FAISS store from the chunked documents, embedding each one with `emb`.
vector_store = FAISS.from_documents(documents=docs, embedding=emb)


In [16]:
# Bare expression: the notebook echoes the store object's repr as the cell output.
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x73fee0359820>

In [18]:
# Fetch the two chunks that best match the query, each paired with its score.
# NOTE(review): in the recorded output the scores are ascending, so a lower
# score appears to mean a closer match (presumably a distance) -- confirm.
vector_store.similarity_search_with_score("machine learning and deep learning", k=2)

[(Document(id='ac30a5b8-f8c7-49ee-92b1-9d581f40835c', metadata={}, page_content='to data analysis, machine learning, deep learning, NLP, and AI based solutions. I also provide'),
  np.float32(0.5497476)),
 (Document(id='b4f2f88b-4c37-4275-9982-e71ba4bb84cf', metadata={}, page_content='PyTorch, SQL, Data Visualization, Machine Learning, Deep Learning, Generative AI, NLP, Prompt'),
  np.float32(0.70201373))]