In [1]:
#
# https://github.com/langchain-ai/rag-from-scratch
#

In [None]:
! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain bs4 faiss-cpu

In [3]:
# Environment setup: call load_dotenv(), then set the LangChain tracing
# variables and the user agent for this process.
from dotenv import load_dotenv
import os

load_dotenv()

os.environ.update({
    'LANGCHAIN_TRACING_V2': 'true',
    'LANGCHAIN_ENDPOINT': 'https://api.smith.langchain.com',
    'USER_AGENT': 'payoyo',
})
# Placeholders: uncomment and fill in to set the API keys directly here.
#os.environ['LANGCHAIN_API_KEY'] = <your-api-key>
#os.environ['OPENAI_API_KEY'] = <your-api-key>


In [5]:
# Cosine similarity function

import numpy as np

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two numeric vectors.

    Args:
        vec1: First vector (anything accepted by ``np.dot`` / ``np.linalg.norm``).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``. When either vector has
        zero norm the similarity is undefined and ``0.0`` is returned.
    """
    dot_product = np.dot(vec1, vec2)
    # Product of the two norms; it is zero when either vector is all zeros.
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # Avoid a division by zero, which would yield NaN and a RuntimeWarning.
        return 0.0
    return dot_product / norm_product

In [6]:
# Token-counting function

import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens produced by encoding `string` with the named tiktoken encoding."""
    # Look up the encoding by name, encode the text, and report how many tokens came out.
    return len(tiktoken.get_encoding(encoding_name).encode(string))

In [None]:
# Sample question / document pair
question = "What kinds of pets do I like?"
document = "My favorite pet is a cat."

# Token count of the question under the "cl100k_base" encoding (displayed as the cell output)
num_tokens_from_string(string=question, encoding_name="cl100k_base")

In [7]:
# Load Documents
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Loader for a single web page
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={
        # Strainer passed to BeautifulSoup as `parse_only`, selecting on these classes
        "parse_only": bs4.SoupStrainer(class_=("post-content", "post-title", "post-header")),
    },
)
# Fetch the page and keep the loaded documents
docs = loader.load()

In [8]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured with chunk_size=1000 and chunk_overlap=200
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# Split the loaded documents (`docs`) into chunks
splits = text_splitter.split_documents(docs)

In [9]:
#### INDEXING ####

# Embed
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Build a Chroma vector store from the splits, with OpenAIEmbeddings as the embedding
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
)

# Expose the store as a retriever using the "similarity" search type
retriever = vectorstore.as_retriever(search_type="similarity")

In [None]:
# cosine_similarity works on numeric vectors, so embed both texts first
# instead of passing the raw strings.
embedder = OpenAIEmbeddings()
query_embedding = embedder.embed_query(question)
document_embedding = embedder.embed_query(document)

similarity = cosine_similarity(query_embedding, document_embedding)
print("Cosine Similarity:", similarity)

In [None]:
# Run the retriever on a sample question
retrieved_docs = retriever.invoke("What are the approaches to Task Decomposition?")

# Number of documents returned (displayed as the cell output)
len(retrieved_docs)

In [10]:
#### RETRIEVAL and GENERATION ####

# Prompt
from langchain import hub

# Pull the prompt published under the name "rlm/rag-prompt" from the hub
prompt = hub.pull("rlm/rag-prompt")

In [11]:
# LLM
from langchain_openai import ChatOpenAI

# Chat model used for generation: gpt-3.5-turbo with temperature 0
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [12]:
# Post-processing
def format_docs(docs):
    """Concatenate the `page_content` of each document, separated by a blank line."""
    contents = [item.page_content for item in docs]
    return "\n\n".join(contents)

In [13]:
# Chain
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Compose the RAG pipeline by chaining its stages with the `|` operator.
rag_chain = (
    # Input mapping: "context" is the retriever piped into format_docs,
    # "question" is a RunnablePassthrough().
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt  # prompt pulled from the hub
    | llm  # ChatOpenAI model
    | StrOutputParser()  # last stage of the chain
)

In [None]:
# Question
# Run the chain on a sample question; the result is displayed as the cell output
rag_chain.invoke("What is Task Decomposition?")

Otro intento sencillo, proveniente de ChatGPT.

In [4]:
#from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
# FAISS and Document come from langchain_community / langchain_core, the same
# packages the earlier cells import Chroma and the runnables from.
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

In [5]:
# Initialize the embedding model (OpenAIEmbeddings with its default arguments)
embeddings = OpenAIEmbeddings()

In [12]:
# Example text snippets, plus a metadata list holding a "source" label in each dict;
# both lists are passed together to FAISS.from_texts in the next cell.
texts = ["This is a text snippet about AI.", "One text about Ancient Roman economy."]
metadatas = [{"source": "Source A"}, {"source": "Source R"}]

In [13]:
# Build a FAISS index from the texts, using `embeddings` as the embedding model
# and attaching `metadatas`
faiss_index = FAISS.from_texts(texts, embeddings, metadatas=metadatas)

In [14]:
# Query the stored snippets with a similarity search
query = "Tell me about AI"
# NOTE(review): this rebinds `docs`, which the document-loading cell also assigns
# and the splitting cell reads; re-running the split cell after this one would
# operate on these search results instead of the loaded page.
docs = faiss_index.similarity_search(query)

In [None]:
# Print each search hit's text together with the "source" entry of its metadata
for doc in docs:
    print(f"Text: {doc.page_content}, Source: {doc.metadata['source']}")