In [1]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.faiss import FAISS
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

In [2]:
# Set KMP_DUPLICATE_LIB_OK for this kernel process.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort that can occur when torch and faiss are loaded together — confirm.
os.environ.update(KMP_DUPLICATE_LIB_OK="TRUE")

In [3]:
# Sample input for the embedding test: one passage of text wrapped in a
# single Document tagged with a "local" source.
SAMPLE_TEXT = (
    "LangChain is a framework for developing applications powered by language models. "
    "It enables applications that: "
    "Are context-aware: connect a language model to sources of context "
    "(prompt instructions, few shot examples, content to ground its response in, etc.) "
    "Reason: rely on a language model to reason "
    "(about how to answer based on provided context, what actions to take, etc.)"
)
_sample_metadata = {"source": "local"}
SAMPLE_DOCUMENT = [Document(page_content=SAMPLE_TEXT, metadata=_sample_metadata)]

In [4]:
# Display the one-element document list built from SAMPLE_TEXT to check its content and metadata.
SAMPLE_DOCUMENT

[Document(page_content='LangChain is a framework for developing applications powered by language models. It enables applications that: Are context-aware: connect a language model to sources of context (prompt instructions, few shot examples, content to ground its response in, etc.) Reason: rely on a language model to reason (about how to answer based on provided context, what actions to take, etc.)', metadata={'source': 'local'})]

In [5]:
# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Fail fast when the key is missing. Substituting a placeholder value would let
# the notebook keep running and only fail later, at the first embedding request,
# with a much less obvious authentication error.
# The key is deliberately never printed so it cannot leak into saved outputs.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [6]:
# Split the sample document into chunks (chunk_size=1000, chunk_overlap=100).
# NOTE(review): the variable name is misspelled ("spilter"); it is kept as-is
# here because other cells may refer to it.
text_spilter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
docs = text_spilter.split_documents(documents=SAMPLE_DOCUMENT)

In [7]:
# Embed the chunks with OpenAIEmbeddings and index them in a FAISS vector store.
# NOTE(review): presumably this issues a request to the OpenAI API, so a valid
# OPENAI_API_KEY must already be in the environment — confirm.
db = FAISS.from_documents(documents=docs, embedding=OpenAIEmbeddings())

In [8]:
# Display the embeddings object attached to the vector store to inspect its configuration.
db.embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x000001FD8AED05D0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000001FD8AED3490>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None)

In [9]:
# Look up the chunks closest to the query "LangChain", requesting up to 5 hits.
# Only one document comes back in the recorded output, presumably because the
# sample text produced a single chunk.
db.similarity_search(query="LangChain", k=5)

[Document(page_content='LangChain is a framework for developing applications powered by language models. It enables applications that: Are context-aware: connect a language model to sources of context (prompt instructions, few shot examples, content to ground its response in, etc.) Reason: rely on a language model to reason (about how to answer based on provided context, what actions to take, etc.)', metadata={'source': 'local'})]

In [10]:
# Look up the chunks closest to the query "framework", requesting up to 5 hits.
db.similarity_search(query="framework", k=5)

[Document(page_content='LangChain is a framework for developing applications powered by language models. It enables applications that: Are context-aware: connect a language model to sources of context (prompt instructions, few shot examples, content to ground its response in, etc.) Reason: rely on a language model to reason (about how to answer based on provided context, what actions to take, etc.)', metadata={'source': 'local'})]

In [11]:
# Search for "framework" and return each hit paired with its score.
# NOTE(review): for LangChain's FAISS store the score is presumably a distance
# (lower = more similar) rather than a similarity — confirm against the docs.
db.similarity_search_with_score(query="framework", k=5)

[(Document(page_content='LangChain is a framework for developing applications powered by language models. It enables applications that: Are context-aware: connect a language model to sources of context (prompt instructions, few shot examples, content to ground its response in, etc.) Reason: rely on a language model to reason (about how to answer based on provided context, what actions to take, etc.)', metadata={'source': 'local'}),
  0.40820533)]