# A simple RAG application using open-source models

# Set up

In [12]:
import os
from dotenv import load_dotenv

# Load settings via python-dotenv before any environment lookups below.
load_dotenv()

# API key handed to the OpenAI chat model; None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Active model name. A name starting with "gpt" selects the OpenAI classes
# below; any other name selects the Ollama classes.
MODEL = "gpt-4o-mini"
# Alternatives (uncomment one to switch):
# MODEL = "gpt-3.5-turbo"
# MODEL = "mixtral:8x7b"
# MODEL = "llama2"

from langchain_community.llms import Ollama
from langchain_openai.chat_models import ChatOpenAI
from langchain_community.embeddings import OllamaEmbeddings
from langchain_openai.embeddings import OpenAIEmbeddings

# Choose the chat model and the matching embedding model from MODEL:
# names that do not start with "gpt" go to Ollama, the rest go to OpenAI.
if not MODEL.startswith("gpt"):
    model = Ollama(model=MODEL)
    embeddings = OllamaEmbeddings(model=MODEL)
else:
    model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model=MODEL)
    embeddings = OpenAIEmbeddings()

# Smoke test: a single call to confirm the selected model answers.
model.invoke("Tell me a joke")

AIMessage(content='Why did the scarecrow win an award? \n\nBecause he was outstanding in his field!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 11, 'total_tokens': 29}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_48196bc67a', 'finish_reason': 'stop', 'logprobs': None}, id='run-48d317cb-10fb-4010-91ee-dfcfa4f8acb1-0', usage_metadata={'input_tokens': 11, 'output_tokens': 18, 'total_tokens': 29})

# System prompt

In [2]:
from langchain.prompts import PromptTemplate

# Prompt text with two placeholders, {context} and {question}. It is built
# from adjacent literals so the trailing space after "can't" stays visible
# and cannot be stripped by an editor.
template = (
    "\n"
    "Answer the question based on the context below. If you can't \n"
    'answer the question, reply "None".\n'
    "\n"
    "Context: {context}\n"
    "\n"
    "Question: {question}\n"
)

prompt = PromptTemplate.from_template(template)

# Render once with dummy values to eyeball the final prompt text.
prompt.format(context="Here is some context", question="Here is a question")

'\nAnswer the question based on the context below. If you can\'t \nanswer the question, reply "None".\n\nContext: Here is some context\n\nQuestion: Here is a question\n'

# Input documents

In [6]:
from langchain.document_loaders import TextLoader
import os

# Directory containing the Markdown files
dir_path = os.path.abspath("../data/math_books")
# Maximum number of Markdown files to load
max_files = 5
# List to hold all pages from the loaded files
all_pages = []

# Only Markdown files count toward the cap (other directory entries are
# ignored instead of using up the limit). Sorting makes the selection
# reproducible, since os.listdir() returns entries in arbitrary order.
markdown_files = sorted(
    name for name in os.listdir(dir_path) if name.endswith(".md")
)[:max_files]

for filename in markdown_files:
    file_path = os.path.join(dir_path, filename)
    print(file_path)
    # Load the Markdown file and split it into documents
    loader = TextLoader(file_path, encoding="utf-8")
    pages = loader.load_and_split()
    print(filename, len(pages))
    # Add pages to the list
    all_pages.extend(pages)

# all_pages now contains the pages from at most max_files Markdown files
print(len(all_pages))
print(all_pages)

c:\Users\Halyna\workspace\python\gumroad\data\math_books\1.md
1.md 1
c:\Users\Halyna\workspace\python\gumroad\data\math_books\10.md
10.md 1
c:\Users\Halyna\workspace\python\gumroad\data\math_books\11.md
11.md 3
c:\Users\Halyna\workspace\python\gumroad\data\math_books\12.md
12.md 1
c:\Users\Halyna\workspace\python\gumroad\data\math_books\13.md
13.md 1
7
[Document(metadata={'source': 'c:\\Users\\Halyna\\workspace\\python\\gumroad\\data\\math_books\\1.md'}, page_content="Title: A Programmer's Introduction to Mathematics: Second Edition (pdf)\n\nURL Source: https://j2kun.gumroad.com/l/pim-book?layout=discover&recommended_by=search&_gl=1*aznv3s*_ga*MTM1NzY2MTEwNy4xNzIzMTczMTQ3*_ga_6LJN6D94N6*MTcyNDAzMzMzNy4xMC4xLjE3MjQwMzMzNjUuMC4wLjA.\n\nMarkdown Content:\n207 ratings\n\nA Programmer's Introduction to Mathematics uses your familiarity with ideas from programming and software to teach mathematics.\n\nYou'll learn about the central objects and theorems of mathematics, covering graphs, calcul

In [13]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import DocArrayInMemorySearch

# Chunk the loaded pages (chunk_size=300, chunk_overlap=50).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
split_documents = text_splitter.split_documents(all_pages)

# Embed the chunks into an in-memory vector store and expose it as a
# retriever configured with k=7.
vectorstore = DocArrayInMemorySearch.from_documents(
    split_documents,
    embedding=embeddings,
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 7})

# Sample retrievals: print the retriever result for each query, followed by
# a blank line.
for sample_query in (
    "what dou you know about math",
    "Find a math book for a school student",
    "Find a math book about cryptography",
):
    print(retriever.invoke(sample_query))
    print()

[Document(metadata={'source': 'c:\\Users\\Halyna\\workspace\\python\\gumroad\\data\\math_books\\1.md'}, page_content="You'll learn about the central objects and theorems of mathematics, covering graphs, calculus, linear algebra, eigenvalues, optimization, and more. You'll also be immersed in the often unspoken cultural attitudes of mathematics, learning both how to read and write proofs while understanding why"), Document(metadata={'source': 'c:\\Users\\Halyna\\workspace\\python\\gumroad\\data\\math_books\\1.md'}, page_content="Markdown Content:\n207 ratings\n\nA Programmer's Introduction to Mathematics uses your familiarity with ideas from programming and software to teach mathematics."), Document(metadata={'source': 'c:\\Users\\Halyna\\workspace\\python\\gumroad\\data\\math_books\\1.md'}, page_content='on your own. By the end of the book, you will be able to learn mathematics on your own. In short, this book will teach you to engage with mathematics.'), Document(metadata={'source': '

# Embed & index

# Pipeline

In [7]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

# The chain takes a dict with a "question" key. That value is piped through
# the retriever to fill "context" and passed through unchanged as "question";
# the resulting mapping feeds the prompt, then the model, then the parser.
get_question = itemgetter("question")
prompt_inputs = {
    "context": get_question | retriever,
    "question": get_question,
}
chain = prompt_inputs | prompt | model | parser


# chain.invoke(
#     {"context": "My parents named me Santiago", "question": "What's your name'?"}
# )

# Questions

In [8]:
# Queries that the cells below send through the chain.
questions = ["Find a math book", "Find a math book about cryptography"]
    


# Process

In [9]:
# Run each question through the chain one at a time. The question is printed
# before the chain is invoked, then the answer, then a blank separator line.
for question in questions:
    print(f"Question: {question}")
    answer = chain.invoke({"question": question})
    print(f"Answer: {answer}")
    print()

Question: Find a math book
Answer: Title: A Programmer's Introduction to Mathematics: Second Edition

Question: Find a math book about cryptography
Answer: None



In [23]:
# Send every question to the chain in a single batch call; each input is a
# dict with the "question" key the chain expects.
batch_inputs = [{"question": q} for q in questions]
chain.batch(batch_inputs)

['AlphaVentures',
 'The official website of the fund is [www.alphaventuresfund.com](http://www.alphaventuresfund.com).',
 'The fund primarily makes equity investments in early-stage technology startups. Additionally, it also invests in convertible debt and bridge financing.',
 'The fund is headquartered in San Francisco, CA, USA.',
 'The fund invests in the following industries:\n- Technology\n- Healthcare\n- Green Energy\n\nFormatted as:\n- Tech\n- Health\n- Sustainability\n\nMaster categories:\n- Technology\n- Life Sciences\n- Renewable Energy']

In [109]:
# from langchain_community.document_loaders import WebBaseLoader
# from langchain_text_splitters import RecursiveCharacterTextSplitter

# loader = WebBaseLoader("https://www.ml.school")
# docs = loader.load()
# documents = RecursiveCharacterTextSplitter(
#     chunk_size=1000, chunk_overlap=200
# ).split_documents(docs)

# documents

[Document(page_content='Building Machine Learning Systems That Don\'t Suck"This is the best machine learning course I\'ve done ever. Worth every cent."Jose Reyes, AI/ML at Cevo AustraliaBuilding Machine Learning Systems That Don\'t SuckA live, interactive program that\'ll help you build production-ready machine learning systems from the ground up.Next cohort:\xa0April 8 - 25, 2024Check the schedule for more details about upcoming cohorts.Register nowLearn how to design, build, deploy, and scale machine learning systems to solve real-world problems.I\'ll lose my mind if I see another book or course teaching people the same basic ideas for the hundredth time. Most people are stuck in beginner mode, and finding help to solve real-world problems is hard.I want to change that.I started writing software 30 years ago. I\'ve written pipelines and trained models for some of the largest companies in the world. I want to show you how to do the same.This is the class I wish I had taken when I star