## Load OpenAI's LLM

In [4]:
from langchain_openai import ChatOpenAI
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass()
llm = ChatOpenAI(model="gpt-4o")

## Indexing
### Load Glossary

In [5]:
# Load the pdf documents from ./glossary
from langchain_community.document_loaders import PyPDFDirectoryLoader
loader = PyPDFDirectoryLoader("./glossary")
docs = loader.load()
print(len(docs), "documents loaded")

126 documents loaded


### Split
Our loaded document is over 42k characters long. This is too long to fit in the context window of many models. Even for those models that could fit the full post in their context window, models can struggle to find information in very long inputs.

To handle this we’ll split the Document into chunks for embedding and vector storage. This should help us retrieve only the most relevant bits of the blog post at run time.

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

len(all_splits)

370

### Store

Now we need to index our text chunks so that we can search over them at runtime. The most common way to do this is to **embed the contents of each document split** and **insert these embeddings into a vector database** (or vector store).

In [7]:
from langchain_chroma import Chroma # Chroma vector store
from langchain_openai import OpenAIEmbeddings

vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())

## Retrieval and Generation

### Retrieval

In [12]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})  
retrieved_docs = retriever.invoke("What is Model Minority?")

len(retrieved_docs)

5

### Generation

Read the example articles

In [13]:
len_articles = 5
example_articles = []
for i in range(1, len_articles + 1):
    with open(f"./articles/{i}.txt", "r") as f:
        example_articles.append(f.read())

print(len(example_articles))

5


Read the concept tree

In [14]:
with open("./concept_tree.txt", "r") as f:
    concept_tree = f.read()

Citation Setup

In [15]:
from langchain_core.pydantic_v1 import BaseModel, Field
from operator import itemgetter
from typing import List


class CitedAnswer(BaseModel):
    """Answer the user question based only on the given sources, and cite the sources used."""

    answer: str = Field(
        ...,
        description="The analysis the user's news article (include main topics and relevant concepts), which is based only on the given sources and the concept tree.",
    )
    citations: List[int] = Field(
        ...,
        description="The source name of the SPECIFIC sources which justify the answer.",
    )

structured_llm = llm.with_structured_output(CitedAnswer)

In [38]:
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

def format_docs_with_id(docs):
    formatted = [
        f"Source ID: {i}\nArticle Title: {doc.metadata['source']}\nArticle Snippet: {doc.page_content}"
        for i, doc in enumerate(docs)
    ]
    return "\n\n" + "\n\n".join(formatted)

template = """
You are a helpful assistant who can analyze news articles using provided context and a concept tree. Use the following context and concept tree to analyze the article at the end.

Context: {context}
Concept Tree: {concept_tree}
Article ID: {article_id}
Article: {input}

Analyze the article and output your analysis in .csv format with the following columns:

1. ARTICLE_ID: The unique identifier for the article (provided above)
2. CHOSEN_CONCEPT: The exact name of a relevant concept from the tree
3. EXPLANATION: Brief explanation of why the concept is relevant, including a quote if applicable

When identifying relevant concepts from the tree, consider both explicit mentions and implicit references. Include parent concepts of your chosen concept in output, each concept should be seperated by /.

Include both explicitly mentioned and implicitly referenced concepts. If there are ambiguities or multiple interpretations, include them in the EXPLANATION column. Create a new row for each relevant concept.

Begin your output with the header row, followed by the data rows:

ARTICLE_ID##CHOSEN_CONCEPT##EXPLANATION

Ensure that your output is strictly in ##-delimited format.

Now, begin your ##-delimited output:
"""
custom_rag_prompt = PromptTemplate.from_template(template)

rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs_with_id(x["context"])), concept_tree= lambda x: concept_tree)
    | custom_rag_prompt
    | structured_llm
)

retrieve_docs = (lambda x: x["input"]) | retriever

chain = RunnablePassthrough.assign(context=retrieve_docs).assign(
    answer=rag_chain_from_docs
)

In [42]:
results = []
for i, example_article in enumerate(example_articles):
    result = chain.invoke({"input": example_article, "article_id": i + 1})
    results.append(result)

## Save the results

In [43]:
import pandas as pd
import io

In [44]:
df_all = pd.DataFrame()
for result in results:
    result_dict = dict(result["answer"])
    data = io.StringIO(result_dict['answer'])
    df = pd.read_csv(data, index_col=False, sep="##")
    df_all = pd.concat([df_all, df])

  df = pd.read_csv(data, index_col=False, sep="##")
  df = pd.read_csv(data, index_col=False, sep="##")
  df = pd.read_csv(data, index_col=False, sep="##")
  df = pd.read_csv(data, index_col=False, sep="##")
  df = pd.read_csv(data, index_col=False, sep="##")


In [47]:
df_all.to_csv("output.csv", index=False)

In [49]:
# Create dataset for evaluation
# Columns: ARTICLE, CONCEPT_TREE, CHOSEN_CONCEPT, EXPLANATION
df_eval = pd.DataFrame()
df_eval = df_all.copy()
df_eval["ARTICLE"] = df_eval["ARTICLE_ID"].apply(lambda x: example_articles[int(x)-1])
df_eval["CONCEPT_TREE"] = concept_tree
df_eval = df_eval[["ARTICLE", "CONCEPT_TREE", "CHOSEN_CONCEPT", "EXPLANATION"]]
df_eval.to_csv("evaluation.csv", index=False)
