<a href="https://colab.research.google.com/github/jaredmullane/LLM_Class/blob/main/TECH16_LLM_Lecture4_prepared.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install openai
!pip install sentence-transformers
!pip install langchain pypdf langchain-openai #tiktoken chromadb



# Standard imports

In [None]:
from google.colab import userdata
from openai import OpenAI

# The API key is read from the Colab secret named 'open_ai_key', so it never
# appears in the notebook source.
open_ai_key = userdata.get('open_ai_key')

# OpenAI SDK client built with that key.
client = OpenAI(
    api_key=open_ai_key,
)


# RAG

In [None]:
!pip install llama-index --upgrade



In [None]:
!pip install pypdf



In [None]:
!wget https://www.goldmansachs.com/intelligence/pages/gs-research/2024-us-equity-outlook-all-you-had-to-do-was-stay/report.pdf

--2024-02-22 05:24:29--  https://www.goldmansachs.com/intelligence/pages/gs-research/2024-us-equity-outlook-all-you-had-to-do-was-stay/report.pdf
Resolving www.goldmansachs.com (www.goldmansachs.com)... 23.213.120.152
Connecting to www.goldmansachs.com (www.goldmansachs.com)|23.213.120.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 491250 (480K) [application/pdf]
Saving to: ‘report.pdf.1’


2024-02-22 05:24:30 (9.39 MB/s) - ‘report.pdf.1’ saved [491250/491250]



In [None]:
import os

# Expose the key through the process environment as OPENAI_API_KEY.
# NOTE(review): presumably this is how llama_index picks up the key — confirm.
os.environ.update({"OPENAI_API_KEY": open_ai_key})

In [None]:
# Import necessary classes from the llama_index package
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Load only the downloaded report.
# The file is selected with `input_files=[...]`. The earlier form,
# `SimpleDirectoryReader("./").load_data("report.pdf")`, did NOT select the file:
# the first positional argument of `load_data` is `show_progress`, so the string
# merely switched on the progress bar and every file in "./" was indexed
# (including stray copies such as report.pdf.1).
documents = SimpleDirectoryReader(input_files=["report.pdf"]).load_data()

# Create a VectorStoreIndex object from the documents. This will involve processing the documents
# and creating a vector representation for each of them, suitable for semantic searching.
index = VectorStoreIndex.from_documents(documents)

# Convert the VectorStoreIndex object into a query engine. This query engine can be used to
# perform semantic searches on the index, matching natural language queries to the most relevant
# documents in the index.
query_engine = index.as_query_engine()

# Use the query engine to answer a natural-language question from the indexed report.
response = query_engine.query("What is the likelihood of a recession in 2024?")

# Print the response obtained from the query.
print(response)

Loading files: 100%|██████████| 2/2 [00:01<00:00,  1.64file/s]


The likelihood of a recession beginning in 2024, according to the Goldman Sachs economics view, is 15%.


In [None]:
# Follow-up question sent through the plain query engine; the recorded output
# shows that it does not recall the previous question.
response = query_engine.query(
    "What did I just ask you?"
)
print(response)

You just asked me to provide an answer based on the given context information.


# Adding chat functionality

In [None]:
# Wrap the index in a chat engine so that follow-up questions can be asked.
chat_engine = index.as_chat_engine(
    chat_mode="openai",
    tool_choice="query_engine_tool",
    verbose=False,
)

In [None]:
# Open the conversation with a plain greeting.
response = chat_engine.chat(
    "Hi"
)
print(response)

Hello! How can I assist you today?


In [None]:
# Same recession question as before, now asked through the chat engine.
response = chat_engine.chat("What is the likelihood of a recession in 2024?")
print(response)

The likelihood of a recession beginning in 2024, according to the Goldman Sachs economics view, is 15%.


In [None]:
# Ask the chat engine to recall the previous turn.
response = chat_engine.chat("What did I just ask you?")
print(response)

You asked about the likelihood of a recession in 2024.


In [None]:
# Follow-up whose "that" only makes sense given the earlier turns.
response = chat_engine.chat("Is that higher or lower than last year?")
print(response)

The likelihood of a recession in 2024 is lower than last year.


# Customize different parts
https://docs.llamaindex.ai/en/stable/getting_started/customization.html

### Chunking

In [None]:
from llama_index.core import ServiceContext

# Service context carrying a custom chunk size of 1000; it is handed to the
# index constructor in the next cell.
# NOTE(review): the output recorded under this cell looks like the tail of a
# warning — presumably a ServiceContext deprecation notice; confirm against the
# installed llama-index version.
service_context = ServiceContext.from_defaults(
    chunk_size=1000,
)

  service_context = ServiceContext.from_defaults(chunk_size=1000)


In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Load only report.pdf. `load_data("report.pdf")` did not select the file: the
# first positional argument of `load_data` is `show_progress`, so the whole
# working directory was being indexed. `input_files` names the file explicitly.
documents = SimpleDirectoryReader(input_files=["report.pdf"]).load_data()

# Build the index with the custom chunk size carried by `service_context`.
index = VectorStoreIndex.from_documents(
    documents, service_context=service_context
)

# Ask the same question as before so the answers can be compared.
query_engine = index.as_query_engine()
response = query_engine.query("What is the likelihood of a recession in 2024?")
print(response)

Loading files: 100%|██████████| 2/2 [00:02<00:00,  1.28s/file]


The likelihood of a recession beginning in 2024, according to the Goldman Sachs economics view, is 15%.


### Vector DB

In [None]:
# Installs the Weaviate vector-store integration for llama-index.
# NOTE(review): no cell visible in this notebook imports it — confirm it is still needed.
%pip install llama-index-vector-stores-weaviate



In [None]:
!pip install llama-index-vector-stores-chroma



In [None]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

# Client whose collections are persisted between runs.
chroma_client = chromadb.PersistentClient()

# `get_or_create_collection` instead of `create_collection`: because the client
# is persistent, the collection still exists on a second run of this cell (or
# after a kernel restart), and `create_collection` then fails.
# NOTE(review): re-indexing into a reused collection may add the same chunks
# again — confirm whether the collection should be cleared before re-indexing.
chroma_collection = chroma_client.get_or_create_collection("tech16example")

# Expose the Chroma collection to llama_index as the vector store backing new indices.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Load only report.pdf. `load_data("report.pdf")` did not select the file: the
# first positional argument of `load_data` is `show_progress`, so the whole
# working directory was being indexed. `input_files` names the file explicitly.
documents = SimpleDirectoryReader(input_files=["report.pdf"]).load_data()

# Build the index on top of the Chroma-backed storage context.
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)

query_engine = index.as_query_engine()
response = query_engine.query("What is the 2024 outlook for US GDP?")
print(response)

Loading files: 100%|██████████| 2/2 [00:00<00:00,  2.23file/s]


The 2024 outlook for US GDP is forecasted to be above-consensus with a growth rate of 2.1%.


### Reranker

In [None]:
from llama_index.core.postprocessor import SentenceTransformerRerank

# Reranker model: BAAI/bge-reranker-base
# Model card: https://huggingface.co/BAAI/bge-reranker-base
# top_n=2 — presumably the number of nodes kept after reranking; confirm in the docs.
rerank = SentenceTransformerRerank(model="BAAI/bge-reranker-base", top_n=2)

In [None]:
# Retrieve 6 candidates per query and pass them through the reranker
# post-processor before the answer is produced.
# NOTE(review): whether the backing vector store honors "hybrid" query mode is
# not visible here — confirm.
query_engine = index.as_query_engine(
    node_postprocessors=[rerank],
    similarity_top_k=6,
    vector_store_query_mode="hybrid",
)

response = query_engine.query("What is the 2024 outlook for US GDP?")
print(response)

The 2024 outlook for US GDP is forecasted to show above-consensus growth of 2.1%.


# Query over structured data

In [None]:
# Installs the llama-index OpenAI "program" integration.
# NOTE(review): no visible cell imports it directly — presumably a transitive requirement; confirm.
%pip install llama-index-program-openai



In [None]:
# Installs the llama-index OpenAI LLM integration.
# NOTE(review): presumably the package behind the `llama_index.llms.openai` import used later — confirm.
%pip install llama-index-llms-openai



In [None]:
import pandas as pd
from llama_index.core import PromptTemplate
from llama_index.core.query_engine import PandasQueryEngine
from vega_datasets import data

# Load the movies dataset shipped with vega_datasets and preview its first rows.
df = data.movies()
df.head()

Unnamed: 0,Title,US_Gross,Worldwide_Gross,US_DVD_Sales,Production_Budget,Release_Date,MPAA_Rating,Running_Time_min,Distributor,Source,Major_Genre,Creative_Type,Director,Rotten_Tomatoes_Rating,IMDB_Rating,IMDB_Votes
0,The Land Girls,146083.0,146083.0,,8000000.0,Jun 12 1998,R,,Gramercy,,,,,,6.1,1071.0
1,"First Love, Last Rites",10876.0,10876.0,,300000.0,Aug 07 1998,R,,Strand,,Drama,,,,6.9,207.0
2,I Married a Strange Person,203134.0,203134.0,,250000.0,Aug 28 1998,,,Lionsgate,,Comedy,,,,6.8,865.0
3,Let's Talk About Sex,373615.0,373615.0,,300000.0,Sep 11 1998,,,Fine Line,,Comedy,,,13.0,,
4,Slam,1009819.0,1087521.0,,1000000.0,Oct 09 1998,R,,Trimark,Original Screenplay,Drama,Contemporary Fiction,,62.0,3.4,165.0


In [None]:
# Query engine over the movies dataframe; verbose=True makes it print the
# pandas instructions it runs (see the recorded output of the query cell below).
query_engine = PandasQueryEngine(
    df=df,
    verbose=True,
)

# Show the default prompt template before customizing it.
prompts = query_engine.get_prompts()
pandas_prompt = prompts["pandas_prompt"]
print(pandas_prompt.template)

You are working with a pandas dataframe in Python.
The name of the dataframe is `df`.
This is the result of `print(df.head())`:
{df_str}

Follow these instructions:
{instruction_str}
Query: {query_str}

Expression:


In [None]:
# Customized pandas prompt: the default template plus one line telling the model
# that the dataframe is about movies, and a request for a natural-language
# explanation. The {df_str}, {instruction_str} and {query_str} placeholders are
# kept from the default template.
movies_prompt_template = """\
You are working with a pandas dataframe in Python.
The name of the dataframe is `df`.
The dataframe contains data about movies.
This is the result of `print(df.head())`:
{df_str}

Follow these instructions:
{instruction_str}
Query: {query_str}

Return the answer from the dataframe with a natural language explanation of the answer.
Expression: """

new_prompt = PromptTemplate(movies_prompt_template)

# Swap the engine's "pandas_prompt" for the customized one.
query_engine.update_prompts({"pandas_prompt": new_prompt})

In [None]:
# Natural-language question answered by a pandas expression over `df`.
response = query_engine.query("List the top 5 directors with the most movies and how many they made")
print(str(response))

> Pandas Instructions:
```
df['Director'].value_counts().head(5)
```
> Pandas Output: Steven Spielberg    23
Woody Allen         16
Martin Scorsese     15
Spike Lee           15
Ridley Scott        14
Name: Director, dtype: int64
Steven Spielberg    23
Woody Allen         16
Martin Scorsese     15
Spike Lee           15
Ridley Scott        14
Name: Director, dtype: int64


# Multidoc, multiturn chat (UBER)

In [None]:
# Apply the nest_asyncio patch before the cells below run.
# NOTE(review): presumably needed because the notebook already runs an event
# loop and the query engines below use asyncio internally — confirm.
import nest_asyncio

nest_asyncio.apply()

In [None]:
!mkdir data
!wget "https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1" -O data/UBER.zip
!unzip data/UBER.zip -d data

In [None]:
!pip install llama-hub unstructured

In [None]:
from pathlib import Path
from llama_index.readers.file import UnstructuredReader

# Filing years to load, newest first.
years = [2022, 2021, 2020, 2019]

loader = UnstructuredReader()
doc_set = {}   # year -> documents loaded for that year's filing
all_docs = []  # every loaded document, in `years` order

for year in years:
    filing_path = Path(f"./data/UBER/UBER_{year}.html")
    year_docs = loader.load_data(file=filing_path, split_documents=False)
    # Tag each document with its filing year. This REPLACES the whole metadata
    # dict, so any metadata set by the reader is dropped.
    for doc in year_docs:
        doc.metadata = {"year": year}
    doc_set[year] = year_docs
    all_docs += year_docs

In [None]:
# initialize simple vector indices
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core import Settings

Settings.chunk_size = 512
index_set = {}
for year in years:
    storage_context = StorageContext.from_defaults()
    cur_index = VectorStoreIndex.from_documents(
        doc_set[year],
        storage_context=storage_context,
    )
    index_set[year] = cur_index
    storage_context.persist(persist_dir=f"./storage/{year}")

In [None]:
# Rebuild `index_set` from the per-year directories persisted under ./storage.
from llama_index.core import load_index_from_storage

index_set = {}
for year in years:
    storage_context = StorageContext.from_defaults(persist_dir=f"./storage/{year}")
    cur_index = load_index_from_storage(storage_context)
    index_set[year] = cur_index

In [None]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata


def _tool_for_year(year):
    """Wrap the index for one filing year in a named, described query-engine tool."""
    metadata = ToolMetadata(
        name=f"vector_index_{year}",
        description=f"useful for when you want to answer queries about the {year} SEC 10-K for Uber",
    )
    return QueryEngineTool(
        query_engine=index_set[year].as_query_engine(),
        metadata=metadata,
    )


# One tool per filing year, in the same order as `years`.
individual_query_engine_tools = [_tool_for_year(year) for year in years]

In [None]:
# Imported under an alias: a bare `OpenAI` import here would rebind the name the
# notebook's setup cell imported from the `openai` SDK, so re-running that setup
# cell afterwards would silently construct a different kind of object.
from llama_index.llms.openai import OpenAI as LlamaIndexOpenAI
from llama_index.core.query_engine import SubQuestionQueryEngine

# Query engine built over the per-year tools, using gpt-3.5-turbo as its LLM.
query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=individual_query_engine_tools,
    llm=LlamaIndexOpenAI(model="gpt-3.5-turbo"),
)

In [None]:
# Expose the sub-question engine itself as a tool, described as the one to use
# for questions spanning several filings.
sub_question_metadata = ToolMetadata(
    name="sub_question_query_engine",
    description="useful for when you want to answer queries that require analyzing multiple SEC 10-K documents for Uber",
)
query_engine_tool = QueryEngineTool(query_engine=query_engine, metadata=sub_question_metadata)

In [None]:
# Full tool list: the per-year tools first, then the cross-year sub-question tool.
tools = [*individual_query_engine_tools, query_engine_tool]


In [None]:
from llama_index.agent.openai import OpenAIAgent

# Agent with access to every tool assembled above.
agent = OpenAIAgent.from_tools(
    tools,
    verbose=True,
)

In [None]:
# Opening turn of the agent conversation.
response = agent.chat(
    "hi, i am bob"
)
print(str(response))

In [None]:
# Question about a single filing year.
response = agent.chat("What were some of the biggest risk factors in 2020 for Uber?")
print(str(response))

In [None]:
# Question that spans all filing years.
cross_query_str = (
    "Compare/contrast the risk factors described in the Uber 10-K across years. "
    "Give answer in bullet points."
)

response = agent.chat(cross_query_str)
print(str(response))