## Load OpenAI's LLM

In [1]:
from langchain_openai import ChatOpenAI
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass()
llm = ChatOpenAI(model="gpt-4o-mini")

## Indexing
### Load Glossary

In [2]:
# Load the pdf documents from ./glossary
from langchain_community.document_loaders import PyPDFDirectoryLoader
loader = PyPDFDirectoryLoader("./glossary")
docs = loader.load()
print(len(docs), "documents loaded")

126 documents loaded


### Split
Our loaded document is over 42k characters long. This is too long to fit in the context window of many models. Even for those models that could fit the full post in their context window, models can struggle to find information in very long inputs.

To handle this we’ll split the Document into chunks for embedding and vector storage. This should help us retrieve only the most relevant bits of the blog post at run time.

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

len(all_splits)

370

### Store

Now we need to index our text chunks so that we can search over them at runtime. The most common way to do this is to **embed the contents of each document split** and **insert these embeddings into a vector database** (or vector store).

In [4]:
from langchain_chroma import Chroma # Chroma vector store
from langchain_openai import OpenAIEmbeddings

vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings(), persist_directory="vectorstore")

## Retrieval and Generation

### Retrieval

In [5]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})  
retrieved_docs = retriever.invoke("What is Model Minority?")

len(retrieved_docs)

5

### Generation

Read the example articles

In [6]:
len_articles = 640
example_articles = []
for i in range(1, len_articles + 1):
    with open(f"../articles/{i}.txt", "r") as f:
        example_articles.append(f.read())

print(len(example_articles))

640


In [7]:
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

def format_docs_with_id(docs):
    formatted = [
        f"Source ID: {i}\nArticle Title: {doc.metadata['source']}\nArticle Snippet: {doc.page_content}"
        for i, doc in enumerate(docs)
    ]
    return "\n\n" + "\n\n".join(formatted)

prompt_template_path = "./prompt_template.txt"
with open(prompt_template_path, "r") as f:
    template = f.read()

custom_rag_prompt = PromptTemplate.from_template(template)

rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs_with_id(x["context"])))
    | custom_rag_prompt
    | llm
)

retrieve_docs = (lambda x: x["input"]) | retriever

chain = RunnablePassthrough.assign(context=retrieve_docs).assign(
    answer=rag_chain_from_docs
)

In [8]:
from tqdm import tqdm
results = []
for i, example_article in tqdm(enumerate(example_articles)):
    result = chain.invoke({"input": example_article})
    results.append(result)

12it [07:55, 39.63s/it]


KeyboardInterrupt: 

In [9]:
print(results[0]["answer"].content)

<result>
{
  "metadata": {
    "title": "Racism is the other virus sweeping America during this pandemic",
    "date": "2020-04-21",
    "publisher": "Chicago Tribune",
    "author": "Julie Morita",
    "asian": "true"
  },
  "racism_types": [
    "Macro-level racism > systematic racism > racial injustice/inequity and oppression",
    "Macro-level racism > systematic racism > white supremacy",
    "Macro-level racism > systematic racism > Racial bias",
    "Individual-level racism > Stereotype B (with hatred) > China/Chinese/Asian virus/Kung flu/plague/Ramen noodle flu",
    "Individual-level racism > Stereotype B (with hatred) > yellow peril",
    "Individual-level racism > Bigotry/prejudice > Scapegoat > Xenophobia"
  ],
  "entities": [
    {
      "name": "Julie Morita",
      "identity": "victim",
      "profession": "public health professional",
      "reactions": {
        "actions": [
          "speaks out against injustices",
          "advocates for a fair and just opportunity

## Save the results

In [None]:
# For each response, extract the content between <result></result> and then put it into a big JSON file
import json

results_json = []

for result in results:
    content = result['answer'].content
    # extract the content between <result></result>
    content = content.split("<result>")[1].split("</result>")[0]
    # Convert the content to JSON
    content_json = json.loads(content)
    results_json.append(content_json)

print(results_json)

[{'metadata': {'title': 'Anti-Asian violence shows folly of defunding police movement', 'date': '2021-03-21', 'publisher': 'USA Today', 'author': 'Tom Elias', 'asian': 'false'}, 'racism_types': ['Macro-level racism > systematic racism > racial injustice/inequity and oppression', 'Macro-level racism > systematic racism > white supremacy', 'Macro-level racism > systematic racism > Racial bias', 'Individual-level racism > Stereotype B (with hatred) > China/Chinese/Asian virus', 'Individual-level racism > Stereotype B (with hatred) > Perpetual/forever foreigner (Go back to China)', 'Individual-level racism > Bigotry/prejudice > Scapegoat > Racism toward Asian women', 'Individual-level racism > Bigotry/prejudice > Scapegoat > Xenophobia', 'Individual-level racism > Racial discrimination > Physical attack/violence > Anti-Asian hate crimes (investigated by police)', 'Individual-level racism > Racial discrimination > Physical attack/violence > Non-crime (not being investigated by police)'], 'e

In [None]:
# Save the results to a JSON file

with open("results.json", "w") as f:
    json.dump(results_json, f, indent=4)
