<a href="https://colab.research.google.com/github/kevalshah90/llms/blob/main/financial_rag_openai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import getpass
import os

# Set your OpenAI API key. The explicit prompt text tells this input apart from
# the other key prompt in the notebook; getpass does not echo what is typed.
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [None]:
# Anthropic API key, used for the Claude 3 Opus grading at the end of the notebook.
# The explicit prompt text tells this input apart from the OpenAI key prompt.
os.environ["ANTHROPIC_API_KEY"] = getpass.getpass("Anthropic API key: ")

In [None]:
!pip install -U -q langchain openai ragas arxiv pymupdf chromadb wandb tiktoken unstructured==0.12.5 datasets langchain_anthropic

# Download SEC filing

In [None]:
from langchain_community.document_loaders import UnstructuredURLLoader

# URL of the filing to load (document abnb-20230930.htm in the sec.gov EDGAR archive)
url = "https://www.sec.gov/Archives/edgar/data/0001559720/000155972023000020/abnb-20230930.htm"
# A User-Agent header is passed to the loader for its HTTP request
# NOTE(review): presumably needed because sec.gov rejects anonymous clients — confirm
loader = UnstructuredURLLoader(urls=[url], headers={'User-Agent': 'virat virat@virat.com'})
# Load the page; `documents` is what the chunking cell splits next
documents = loader.load()

# Chunk and store filing in vector DB

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter

# Naively chunk the SEC filing by tokens: chunk_size is 256 tokens and
# chunk_overlap is 20 tokens. No attempt is made to respect section boundaries.
token_splitter = TokenTextSplitter(chunk_size=256, chunk_overlap=20)
docs = token_splitter.split_documents(documents)

In [None]:
# Save the chunked docs in a Chroma vector DB, embedding them with OpenAI's
# "text-embedding-3-large" model. `vectorstore` is queried later for retrieval.
vectorstore = Chroma.from_documents(docs, OpenAIEmbeddings(model="text-embedding-3-large"))

# Load DataFrame from storage (if it exists)

In [None]:
# Google Drive path of the cached test set (read by the load cell, written by
# the generation cell).
# NOTE(review): the file name says "annual_report", while the filing URL loaded
# earlier in this notebook is abnb-20230930.htm — confirm the name is intended.
saved_csv_path = "/content/drive/MyDrive/abnb-2023-annual_report-100-questions.csv"

In [None]:
import pandas as pd
import ast
from google.colab import drive


# Mount your Google Drive
drive.mount('/content/drive')

try:
    # Attempt to read the cached test set into a DataFrame
    df = pd.read_csv(saved_csv_path)
except FileNotFoundError:
    # No cached test set yet: leave df as None so the generation cell runs
    df = None
    print("No DataFrame found in storage.")
else:
    # The CSV stores each list of contexts as its string repr; convert it back
    # from str to list of str. literal_eval only evaluates Python literals,
    # so a malformed cell raises instead of executing code.
    df['contexts'] = df['contexts'].apply(ast.literal_eval)
    print("Loaded DataFrame from storage.")

# Generate a test dataset (if no stored one exists)

In [None]:
import pandas as pd
from google.colab import drive
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

# Only generate a new test set when none was loaded from storage
if df is None:
  # generator with openai models
  generator = TestsetGenerator.with_openai()

  # IMPORTANT: test-set generation calls OpenAI models (GPT-4, per the original
  # author's note — TODO confirm against the with_openai() defaults), so try a
  # small test_size first
  test_size = 100

  # generate testset from the chunked docs:
  # 40% simple, 40% reasoning and 20% multi-context questions
  print("Generating testset...")
  testset = generator.generate_with_langchain_docs(docs, test_size=test_size, distributions={simple: 0.40, reasoning: 0.40, multi_context: 0.20})
  df = testset.to_pandas()

  # Mount your Google Drive
  drive.mount('/content/drive')

  # Save the DataFrame as a CSV file to your Google Drive (without the index
  # column) so later runs can load it instead of regenerating
  df.to_csv(saved_csv_path, index=False)

# Clean DataFrame

In [None]:
# Preview the first 5 rows of the dataset
df.head(5)

In [None]:
# Step 1: Drop rows with duplicate questions (the first occurrence is kept)
filtered_df = df.drop_duplicates(subset='question', keep='first')

# Step 2: Remove rows with missing ground truth, i.e. a real NaN or the
# string "nan" in any letter case
condition_nan = pd.isna(filtered_df['ground_truth'])
condition_string_nan = filtered_df['ground_truth'].astype(str).str.lower() == 'nan'
filtered_df = filtered_df[~(condition_nan | condition_string_nan)]

# Step 3: Only keep columns we care about. Take an explicit copy so that
# filtered_df is an independent frame: a later cell adds an 'answer' column to
# it, and writing to a slice of df triggers pandas' SettingWithCopyWarning.
filtered_df = filtered_df[['question', 'contexts', 'ground_truth']].copy()

filtered_df.head()

# Generate answers using LLM

In [None]:
# System prompt that casts the model as a financial assistant for SEC filings.
# The empty first and last items reproduce the leading and trailing newline of
# the text.
prompt = "\n".join((
    "",
    "You are an advanced language model designed to",
    "function as a financial assistant with expert-level",
    "proficiency in reading and interpreting SEC filings.",
    "Your primary role is to assist users in understanding",
    "complex financial documents, extracting key information,",
    "and providing clear, accurate answers to questions",
    "related to these filings.",
    "",
))

In [None]:
from openai import OpenAI
from typing import List
import json

# OpenAI client used by the helper functions in this cell; the API key is read
# from the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def expand_queries(query: str, model="gpt-3.5-turbo-0125") -> dict:
  """Ask the LLM to expand a user query into sub-queries.

  Args:
    query: The user question to expand.
    model: Name of the OpenAI chat model to call.

  Returns:
    The model's JSON response parsed with json.loads. The system prompt asks
    for an object with a `queries` field holding a list of `query` objects;
    that shape is requested from the model, not validated here.

  Raises:
    json.JSONDecodeError: If the response content is not valid JSON.
  """
  response = client.chat.completions.create(
      model=model,
      # Request a JSON object so the content can be parsed below
      response_format={"type": "json_object"},
      # temperature=0 and a fixed seed keep the output as repeatable as possible
      temperature=0,
      seed=42,
      messages=[
          {"role": "system", "content": "You are a helpful assistant that expands a user query into sub-queries. The sub-queries should be mutually exclusive and collectively exhaustive. Your response will be a JSON object with a `queries` field, which is a list of `query` objects."},
          {"role": "user", "content": query},
      ]
  )
  return json.loads(response.choices[0].message.content)

def rerank_documents(query: str, documents: list, top_k, model="gpt-3.5-turbo-0125") -> str:
  """Ask the LLM to re-rank retrieved documents by relevance to a query.

  Args:
    query: The question the documents should help answer.
    documents: The retrieved documents; the list is interpolated into the
      user message as-is.
    top_k: How many documents the model is told to return.
    model: Name of the OpenAI chat model to call.

  Returns:
    The raw text of the model's reply (a single string, not a parsed list).
    The system prompt asks for the top `top_k` documents, most relevant first,
    with their full text.
  """
  response = client.chat.completions.create(
      model=model,
      # temperature=0 and a fixed seed, matching the other helpers in this
      # cell, keep the ranking as repeatable as possible
      temperature=0,
      seed=42,
      messages=[
          {"role": "system", "content": f"You are an expert document ranker. Given a query and a list of documents, re-rank the documents by their relevancy to answering the question. Sort the list of documents from most relevant to the question to least relevant.  Only return the top {top_k} documents. Include the full document text in your response"},
          {"role": "user", "content": f"Query: {query} Documents: {documents}"},
      ]
  )
  return response.choices[0].message.content

def answer_question(query: str, documents: list, prompt: str, model="gpt-3.5-turbo-0125") -> str:
  """Answer a question with the LLM, using the given documents as context.

  Args:
    query: The question to answer.
    documents: The context; it is interpolated into the user message as-is.
    prompt: The system prompt for the chat.
    model: Name of the OpenAI chat model to call.

  Returns:
    The text of the model's reply.
  """
  user_message = f"Please answer the question: ```{query}``` given the context: ```{documents}```. Optimize for conciseness and answer correctness."
  chat_messages = [
      {"role": "system", "content": prompt},
      {"role": "user", "content": user_message},
  ]
  completion = client.chat.completions.create(
      model=model,
      temperature=0,
      seed=42,
      messages=chat_messages,
  )
  # Hand back the text of the first (only) choice
  return completion.choices[0].message.content

In [None]:
# Try the pipeline on a single question: the second row of the cleaned dataset
question = filtered_df.iloc[1]['question']
ground_truth = filtered_df.iloc[1]['ground_truth']

print(f"Question: {question}")
print(f"Ground Truth: {ground_truth}")

# Retrieve the 5 most similar chunks from the vector DB
top_k_docs = vectorstore.similarity_search(question, k=5)

# Extract the text content from documents. Dedicated names are used here so
# the global `documents` (the loaded filing that the chunking cell splits) is
# not overwritten; otherwise re-running that cell after this one would fail.
retrieved_docs = [{"text": doc.page_content} for doc in top_k_docs]
print(f"Before reranking docs: {retrieved_docs[0]}")

# Rerank the documents (the reranker returns its reply as one string)
reranked_docs = rerank_documents(question, retrieved_docs, top_k=5)
print(f"After reranking docs: {reranked_docs}")

answer = answer_question(question, reranked_docs, prompt)
print(f"Answer: {answer}")

In [None]:
import time

answers = []
k = 5

# Fields for computing inference speed
total_time = 0.0
num_iterations = 0

# Execute RAG pipeline. enumerate() supplies the running question number: the
# DataFrame index labels have gaps once rows were filtered out, so they cannot
# be used for it.
for position, (_, row) in enumerate(filtered_df.iterrows(), start=1):
  # Get start time
  start_time = time.time()

  # Extract the question
  question = row['question']

  # Print current question
  print(f"Answering question {position}:   {question}")

  # Query vector DB for documents
  top_k_docs = vectorstore.similarity_search(question, k)

  # Extract the text content from documents. Dedicated names keep the global
  # `documents` (the loaded filing that the chunking cell splits) intact.
  retrieved_docs = [{"text": doc.page_content} for doc in top_k_docs]

  # Rerank the documents (the reranker returns its reply as one string)
  reranked_docs = rerank_documents(question, retrieved_docs, k)

  # Ask the LLM
  answer = answer_question(question, reranked_docs, prompt)

  # Add generated answer to our list of answers
  answers.append(answer)

  # Get end time
  end_time = time.time()
  # Update total execution time (excluding sleep time)
  total_time += (end_time - start_time)
  num_iterations += 1

  # Sleep for 1 second to avoid overloading the LLM
  time.sleep(1)

# Add the generated answers as a new column in the DataFrame. assign() returns
# a new frame, so this never writes into a slice of another DataFrame.
filtered_df = filtered_df.assign(answer=answers)

In [None]:
# Calculate the average execution time. When no question was processed there
# is nothing to average, so avg_time is None instead of dividing by zero.
avg_time = total_time / num_iterations if num_iterations else None

if avg_time is None:
  print("No RAG calls were timed.")
else:
  print(f"Took {avg_time} avg seconds for each RAG call")

# Visually inspect the answers

In [None]:
# Display the questions, contexts and ground truths with the generated answers
filtered_df

# Evaluate answers using RAGAS

In [None]:
from datasets import Dataset

# Convert the DataFrame into a HuggingFace DataSet for RAGAS evaluation.
# preserve_index=False leaves the pandas index out of the Dataset, so no
# '__index_level_0__' column has to be removed afterwards. Removing it by name
# raises when the column was never created, which is the case when the frame
# still has a plain RangeIndex.
dataset = Dataset.from_pandas(filtered_df, preserve_index=False)

# Use GPT-4 for Evaluation

In [None]:
from langchain_openai import ChatOpenAI
from ragas import evaluate
from ragas.metrics import faithfulness, answer_correctness

# Grader LLM: GPT-4 ("gpt-4-0125-preview") at temperature 0
gpt = ChatOpenAI(model_name="gpt-4-0125-preview", temperature=0)

# Score the generated answers with the answer_correctness metric only
# (faithfulness is imported above but not used here)
gpt_result = evaluate(
    dataset,
    llm=gpt,
    metrics=[answer_correctness],
)

print(gpt_result)

# Use Claude 3 Opus for Evaluation

In [None]:
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate

# Grader LLM: Claude 3 Opus at temperature 0; the API key is read from the
# ANTHROPIC_API_KEY environment variable
claude = ChatAnthropic(
    temperature=0,
    model_name="claude-3-opus-20240229",
    anthropic_api_key=os.environ["ANTHROPIC_API_KEY"],
)

# Same dataset and metric as the GPT-4 evaluation, only the grader differs.
# `evaluate` and `answer_correctness` come from the imports of the GPT-4
# evaluation cell.
claude_result = evaluate(
    dataset,
    llm=claude,
    metrics=[answer_correctness],
)

print(claude_result)

# Get mean of both GPT and Claude scores

In [None]:
# Mean of the answer_correctness scores from the GPT and Claude evaluations
print(f"Average score: {(gpt_result['answer_correctness'] + claude_result['answer_correctness']) / 2}")