# Corrective Retrieval Augmented Generation

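This notebook builds a searchable vector index over Wikipedia articles, which serves as the document-retrieval step of a Corrective RAG pipeline.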

In [1]:
import os
from pathlib import Path
from dotenv import load_dotenv

In [2]:
# Read environment variables from the `.env` file that sits one directory
# above the current working directory (presumably where the API credentials
# used later in the notebook are kept -- confirm).
dotenv_path = Path.cwd().parent / ".env"
load_dotenv(dotenv_path)

True

## Import embeddings

In [3]:
from langchain_openai import OpenAIEmbeddings
# Embedding model shared by indexing and querying; both must use the same
# model so that document and query vectors live in the same space.
embedding_model_name = "text-embedding-3-small"
openai_embeddings = OpenAIEmbeddings(model=embedding_model_name)

## Get Wikipedia data

In [4]:
import gzip
import json
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [18]:
# Location of the Wikipedia dump, relative to the notebook's working directory.
# NOTE(review): the name ends in `.tar.gz`, but the file is read below with
# `gzip.open` as line-delimited JSON -- confirm it is a plain gzip file and
# not a tar archive.
wikipedia_filepath = "../datasets/simplewiki-2020-11-01.jsonl.tar.gz"

In [20]:
# Build the corpus: stream the gzipped JSON-lines dump, keep only the articles
# whose lead text mentions a keyword, wrap them as Documents, then chunk them.

# to run things faster get a subset of docs: an article is kept when any of
# these lower-case words appears as a whitespace-separated token in its text
subset_keywords = {"india"}
# only the first few paragraphs of each article are indexed
lead_paragraphs = 3

docs = []
with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        line = line.strip()
        if not line:
            # tolerate blank lines instead of failing inside json.loads
            continue
        data = json.loads(line)
        # a record with a missing or null "paragraphs" field yields empty text
        paragraphs = data.get("paragraphs") or []
        text = ' '.join(paragraphs[:lead_paragraphs])
        # Filter while streaming so the full dump is never held in memory.
        # Each article is tested once, so it is kept at most once no matter
        # how many keywords it matches.
        if subset_keywords.isdisjoint(text.lower().split()):
            continue
        # create docs object
        docs.append(Document(
            page_content=text,
            metadata={
                "title": data.get("title"),
                "article_id": data.get("id")
            }
        ))

# chunk: up to 2000 characters per chunk, 300 characters shared between
# consecutive chunks so that text cut at a boundary appears in both
splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=300)
chunked_docs = splitter.split_documents(docs)

In [21]:
# Number of chunks produced by the splitter (each one is embedded later).
len(chunked_docs)

1322

In [22]:
# Preview the first three chunks to sanity-check content and metadata.
chunked_docs[:3]

[Document(metadata={'title': 'Basil', 'article_id': '73985'}, page_content='Basil ("Ocimum basilicum") ( or ) is a plant of the Family Lamiaceae. It is also known as Sweet Basil or Tulsi. It is a tender low-growing herb that is grown as a perennial in warm, tropical climates. Basil is originally native to India and other tropical regions of Asia. It has been cultivated there for more than 5,000 years. It is prominently featured in many cuisines throughout the world. Some of them are Italian, Thai, Vietnamese and Laotian cuisines. It grows to between 30–60\xa0cm tall. It has light green, silky leaves 3–5\xa0cm long and 1–3\xa0cm broad. The leaves are opposite each other. The flowers are quite big. They are white in color and arranged as a spike. The plant tastes somewhat like anise, with a strong, pungent, sweet smell. Basil is very sensitive to cold. It is best grown in hot, dry conditions. While most common varieties are treated as annuals, some are perennial, including African Blue a

## Create a vector DB and persist it to disk

In [23]:
from langchain_chroma import Chroma

In [24]:
# create a vector db
# Give every chunk a deterministic id (article id + position in the chunk
# list). Without explicit ids each run of this cell stores the chunks under
# new random ids, so re-running against the same persist_directory keeps
# appending duplicates; with stable ids a re-run writes to the same records.
chunk_ids = [
    f"{doc.metadata.get('article_id')}-{position}"
    for position, doc in enumerate(chunked_docs)
]
chroma_db = Chroma.from_documents(
    documents=chunked_docs,
    ids=chunk_ids,
    collection_name='rag_wikipedia_db',
    embedding=openai_embeddings,
    # use cosine distance for the HNSW index
    collection_metadata={"hnsw:space": "cosine"},
    # the collection is written to this directory
    persist_directory="../wikipedia_db"
)

### Define a retriever

In [25]:
# Retriever that returns at most `k` chunks and drops any chunk whose
# relevance score falls below `score_threshold`, so a query with no good
# match can legitimately return an empty list.
similarity_threshold_retriever = chroma_db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={
        # `k` is a document count and must be a positive integer; the previous
        # value of 0.5 is not a valid number of results.
        "k": 3,
        # minimum relevance score a chunk needs to be returned
        "score_threshold": 0.3
    }
)