In [8]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser

In [4]:
load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [14]:
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
# from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
# import faiss
from langchain_community.vectorstores import FAISS, Chroma
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain import hub
# import pprintr
from langchain_core.runnables import RunnablePassthrough

## With dimensions

In [15]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-large", dimensions=1536)
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x11d55ae40>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x11d55b770>, model='text-embedding-3-large', dimensions=1536, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [17]:
query_embeddings_results = embeddings.embed_query("What is the capital of France?")

In [18]:
len(query_embeddings_results)

1536

## Without dimensions

In [19]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
query_embeddings_results = embeddings.embed_query("What is the capital of France?")
len(query_embeddings_results)

3072

In [21]:
text_loader = TextLoader("data/speech.txt")
documents = text_loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(documents)
docs

result = embeddings.embed_documents([doc.page_content for doc in docs])
result


[[0.02589600533246994,
  -0.01472490094602108,
  -0.002396336058154702,
  0.05739879980683327,
  0.01721680723130703,
  0.012275470420718193,
  -0.01948217675089836,
  0.020657338201999664,
  0.020430801436305046,
  0.03233814984560013,
  0.017174331471323967,
  0.10466007143259048,
  -0.011801159009337425,
  0.0013025874504819512,
  -0.009096873924136162,
  0.0034352203365415335,
  -0.020402483642101288,
  0.035821154713630676,
  -0.0375484973192215,
  -0.011454273946583271,
  -0.00925969798117876,
  -0.009493313729763031,
  -0.03230983018875122,
  0.00462984899058938,
  0.0225970596075058,
  0.02372974529862404,
  -0.019821982830762863,
  0.017131855711340904,
  0.048054151237010956,
  0.04660997539758682,
  0.0006632399745285511,
  0.016579672694206238,
  0.02666056714951992,
  -0.010576443746685982,
  0.06065526604652405,
  -0.00546874338760972,
  0.010902090929448605,
  0.017726516351103783,
  0.0003867056511808187,
  0.0412297248840332,
  -0.002366249216720462,
  -0.0238854885101

In [24]:
len(result[0])

3072

In [23]:
result

[[0.02589600533246994,
  -0.01472490094602108,
  -0.002396336058154702,
  0.05739879980683327,
  0.01721680723130703,
  0.012275470420718193,
  -0.01948217675089836,
  0.020657338201999664,
  0.020430801436305046,
  0.03233814984560013,
  0.017174331471323967,
  0.10466007143259048,
  -0.011801159009337425,
  0.0013025874504819512,
  -0.009096873924136162,
  0.0034352203365415335,
  -0.020402483642101288,
  0.035821154713630676,
  -0.0375484973192215,
  -0.011454273946583271,
  -0.00925969798117876,
  -0.009493313729763031,
  -0.03230983018875122,
  0.00462984899058938,
  0.0225970596075058,
  0.02372974529862404,
  -0.019821982830762863,
  0.017131855711340904,
  0.048054151237010956,
  0.04660997539758682,
  0.0006632399745285511,
  0.016579672694206238,
  0.02666056714951992,
  -0.010576443746685982,
  0.06065526604652405,
  -0.00546874338760972,
  0.010902090929448605,
  0.017726516351103783,
  0.0003867056511808187,
  0.0412297248840332,
  -0.002366249216720462,
  -0.0238854885101