In [23]:
import os
import platform

import openai
import chromadb
import langchain

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import TokenTextSplitter,RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import ChatVectorDBChain
from langchain.document_loaders import GutenbergLoader,DirectoryLoader,TextLoader

import os,sys
# Make the project-local helper modules in ../../libs importable.
# The path is relative, so it depends on the directory the kernel was started in.
sys.path.insert(0,'../../libs')
from utils import load_json
from llm_utils import tiktoken_len
print('Python: ', platform.python_version())

import ssl
# WARNING: when PYTHONHTTPSVERIFY is unset or empty (and this Python build exposes
# ssl._create_unverified_context), the default HTTPS context factory is replaced by the
# unverified one, i.e. TLS certificate verification is switched off for code that relies
# on that default. Set PYTHONHTTPSVERIFY to a non-empty value to skip this override.
if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
    ssl._create_default_https_context = ssl._create_unverified_context

Python:  3.8.13


#### Load custom data 

In [71]:
# Paths used throughout the notebook.
# Knowledge_Base_Folder: directory that is scanned for the source text files.
# Index_Save_Path: directory where the Chroma index is persisted and later reloaded from.
# NOTE(review): both are absolute, machine-specific paths; adjust them before running elsewhere.
Knowledge_Base_Folder = '/data/chuang/QA_LangChan/Knowledge_Base'
Index_Save_Path = '/data/chuang/QA_LangChan/KB_Index/chroma'

In [21]:
# Load every .txt file found (recursively) under the knowledge-base folder.
loader = DirectoryLoader(Knowledge_Base_Folder, glob="**/*.txt",loader_cls=TextLoader)
docs = loader.load()
# Preview the metadata of up to three loaded documents. Iterating over a slice
# (instead of indexing 0..2) avoids an IndexError when fewer than three files exist.
for doc in docs[:3]:
    print(doc.metadata)

{'source': '/data/chuang/QA_LangChan/Knowledge_Base/2022_FM_April_0.txt'}
{'source': '/data/chuang/QA_LangChan/Knowledge_Base/2022_WEO_April_0.txt'}
{'source': '/data/chuang/QA_LangChan/Knowledge_Base/2022_GFSR_April_0.txt'}


#### Split text into meaningful small chunks 
- We can do it paragraph by paragraph once the data is fully cleaned 
- for now, just put them into chunks using RecursiveCharacterTextSplitter 
- The splitter takes a list of separators and tries to split on them in order until the chunks are small enough. The default list is ["\n\n", "\n", " ", ""]. This has the effect of trying to keep all paragraphs (and then sentences, and then words) together as long as possible, as those would generically seem to be the strongest semantically related pieces of text.

In [37]:
# Recursive splitter whose chunk lengths are measured with tiktoken_len rather than
# character counts, so chunk_size / chunk_overlap are in that function's units.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,        # choose this to fit the embedding model's input size
    chunk_overlap=20,
    length_function=tiktoken_len,
)

# Split every loaded document; each resulting chunk carries its source document's metadata.
allchunks = [
    chunk
    for source_doc in docs
    for chunk in text_splitter.create_documents(
        [source_doc.page_content], metadatas=[source_doc.metadata]
    )
]

In [55]:
# Inspect up to two chunks: metadata, lookup index, measured length and text.
# Slicing keeps the cell from raising IndexError when fewer than two chunks exist.
for chunk in allchunks[:2]:
    print(chunk.metadata,chunk.lookup_index)
    print("token length : {}".format(tiktoken_len(chunk.page_content)))
    print(chunk.page_content)
    

{'source': '/data/chuang/QA_LangChan/Knowledge_Base/2022_FM_April_0.txt'} 0
token length : 380
Description: Washington, DC : International Monetary Fund, 2009- | Semiannual | Some issues also have thematic titles.
Publication orders may be placed online, by fax, or through the mail: International Monetary Fund, Publication Services
Online Annex 1.2. Analysis of Poverty, Social Safety Nets, and Informality Online Annex 1.3. Inflation and Fiscal Nexus: Empirical Findings
Online Annex 2.1. Estimating the Revenue Impact of Pillar 1 and 2 Online Annex 2.2. Corporate Tax Rate Strategic Reaction
Online Annex 2.3. Survey of International Coordination and Tax Administration Online Annex 2.4. Revenue Implications of Cross-Border Remote Work
— to indicate that the figure is zero or less than half the final digit shown, or that the item does not exist
– between years or months (for example, 2008–09 or January–June) to indicate the years or months covered, including the beginning and ending years o

#### Creating Embeddings 

In [57]:
### Load all API keys
# The keys are read from JSON files kept outside the project and exported as environment
# variables, so they never appear in the notebook itself.
# NOTE(review): the key files are referenced by absolute, user-specific paths.
openai_key = load_json('/home/chuang/Dev/Keys/openai_key.json') 
hf_key = load_json('/home/chuang/Dev/Keys/huggingface_key.json')
os.environ['OPENAI_API_KEY'] = openai_key['ChatGPT']['API_KEY']
os.environ['HUGGINGFACEHUB_API_TOKEN'] = hf_key['HuggingFace']['API_KEY']

- initialize the model with OpenAI embeddings; of course you can use other embedding models as well 

In [58]:
model_name = 'text-embedding-ada-002'  ## the recommended embedding model for gpt3.5

# Embedding client shared by indexing (documents) and retrieval (queries).
embed = OpenAIEmbeddings(
    document_model_name=model_name,
    query_model_name=model_name,        ## a different model can be set for embedding queries
    openai_api_key=os.environ['OPENAI_API_KEY'] 
)

In [70]:
## Sanity check: embed a single chunk
res = embed.embed_documents([allchunks[0].page_content]) ## embed_documents expects a list of texts
# Prints the number of embedded texts and the length of the first embedding vector.
print("embeding size :",len(res), len(res[0]))

embeding size : 1 1536


- embed everything and save to chroma 

In [72]:
# Build the Chroma index only when nothing has been persisted yet. Re-running an
# unconditional from_documents() would re-embed the whole corpus (paid API calls) and
# add the same chunks to the persisted collection a second time.
# To force a rebuild, delete the Index_Save_Path directory first.
if os.path.isdir(Index_Save_Path) and os.listdir(Index_Save_Path):
    vectordb = Chroma(persist_directory=Index_Save_Path, embedding_function=embed)
else:
    vectordb = Chroma.from_documents(allchunks, embed,persist_directory=Index_Save_Path)
    vectordb.persist()

Running Chroma using direct local API.
No existing DB found in /data/chuang/QA_LangChan/KB_Index/chroma, skipping load
No existing DB found in /data/chuang/QA_LangChan/KB_Index/chroma, skipping load
Persisting DB to disk, putting it in the save folder /data/chuang/QA_LangChan/KB_Index/chroma


- load back from local 

In [73]:
# Reload the persisted index from disk and use it as normal.
# The same embedding object is passed again so that queries can be embedded.
vectordb = Chroma(persist_directory=Index_Save_Path, embedding_function=embed)

Running Chroma using direct local API.
loaded in 556 embeddings
loaded in 1 collections
collection with name langchain already exists, returning existing collection
PersistentDuckDB del, about to run persist
Persisting DB to disk, putting it in the save folder /data/chuang/QA_LangChan/KB_Index/chroma


In [92]:
Q = "weo macro-economic forecast in 2022" 
res_docs = vectordb.similarity_search_with_score(Q,k=10) ## through this wrapper you can only filter by metadata
# Each hit is unpacked as a (document, score) pair; this inspects the LAST of the k hits.
# NOTE(review): if the goal is to show the best match, res_docs[0] is probably what is
# wanted - confirm how the returned hits are ordered.
t,score = res_docs[-1]
print(t.page_content)
print(score)

lobal economic prospects have worsened significantly since our last World Economic Outlook forecast in January. At the time, we had projected the global recovery to
strengthen from the second quarter of this year after a short-lived impact of the Omicron variant. Since then, the outlook has deteriorated, largely because of Russia’s invasion of Ukraine—causing a tragic
This crisis unfolds while the global economy was on a mending path but had not yet fully recovered from the COVID-19 pandemic, with a significant diver- gence between the economic recoveries of advanced economies and emerging market and developing ones. In addition to the war, frequent and wider-ranging lockdowns in China—including in key manufactur- ing hubs—have also slowed activity there and could cause new bottlenecks in global supply chains. Higher, broader, and more persistent price pressures also led
to a tightening of monetary policy in many coun- tries. Overall risks to economic prospects have risen sharply and p

#### vector matching with keywords, and metadata filtering 
- please see [Chroma doc](https://docs.trychroma.com/usage-guide#querying-a-collection) for details 
- see chroma object code [here](https://github.com/hwchase17/langchain/blob/75149d6d38cc8952ebaf13d7b9fe48c466dbfa19/langchain/vectorstores/chroma.py#L19)

In [157]:
from langchain.vectorstores.chroma import _results_to_docs_and_scores

In [160]:
## Vector search restricted to chunks whose text contains "China" or "china"
# Goes through the underlying Chroma collection directly, because a document-content
# filter (where_document) is passed alongside the query embedding.
q_embedding = vectordb._embedding_function.embed_query(Q)
res = vectordb._collection.query(
    query_embeddings=q_embedding,
    n_results=5,
    where_document={"$or": [{"$contains": spelling} for spelling in ("China", "china")]},
)
# Convert the raw query result and show the first entry.
r = _results_to_docs_and_scores(res)
print(r[0])

(Document(page_content='A number of assumptions have been adopted for the projections presented in the World Economic Outlook (WEO). It has been assumed that real effective exchange rates remained constant at their average levels during February 22, 2022 to March 22, 2022, except for those for the currencies participating in the European exchange rate mechanism II, which are assumed to have remained constant in nominal terms relative to the euro; that established policies of national authorities will be maintained (for specific assumptions about fiscal and monetary policies for selected economies, see Box A1 in the Statistical Appendix); that the average price of oil will be\n$106.83 a barrel in 2022 and $92.63 a barrel in 2023; that the three-month government bond yield for the United States will average 0.9 percent in 2022 and 2.4 percent in 2023, for the euro area will average –0.7 percent in 2022 and 0.0 percent in 2023, and for Japan will average 0.0 percent in 2022 and 0.1 percen

#### Generative Question-Answering
- [OverallQA](https://langchain.readthedocs.io/en/latest/use_cases/question_answering.html)
- [QA DOC](https://langchain.readthedocs.io/en/latest/modules/indexes/chain_examples/qa_with_sources.html)
- [VectorDB QA](https://langchain.readthedocs.io/en/latest/modules/indexes/chain_examples/vector_db_qa_with_sources.html)


In [122]:
from langchain import OpenAI, VectorDBQA,VectorDBQAWithSourcesChain
# LLM that writes the final answer; temperature 0.0 to keep the output as stable as possible
llm = OpenAI(
    openai_api_key=os.environ['OPENAI_API_KEY'],
    model_name='gpt-3.5-turbo',
    temperature=0.0
)

# Retrieval + answer chain over the vector store: k=5 chunks are requested per question
# and the retrieved documents are returned together with the answer.
qa_with_sources = VectorDBQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",# "stuff" puts all retrieved context into one prompt, so it only works with a small context
    vectorstore=vectordb,
    k=5,
    return_source_documents=True)


In [123]:
Q = "what is our macro economic forecast for 2022" 
# Run the chain on the question. The result is a dict with the keys
# 'question', 'answer', 'sources' and 'source_documents'.
res = qa_with_sources(Q)

In [112]:
# Display the full result dict returned by the chain.
res

{'question': 'what is our macro economic forecast for 2022',
 'answer': 'The macroeconomic forecast for 2022 is uncertain, with high inflation expected to persist and downside risks dominating, including from a possible worsening of the war, escalation of sanctions on Russia, a sharper-than-anticipated deceleration in China as a strict zero-COVID strategy is tested by Omicron, and a renewed flare-up of the pandemic should a new, more virulent virus strain emerge. The forecast for emerging market and developing economies is expected to be worse than for advanced economies, with output remaining below pre-pandemic trends throughout the forecast horizon. Specific projections for individual countries are also provided in the sources. \n',
 'sources': '/data/chuang/QA_LangChan/Knowledge_Base/2022_FM_April_0.txt, /data/chuang/QA_LangChan/Knowledge_Base/2022_WEO_April_0.txt',
 'source_documents': [Document(page_content='Estonia: The forecast incorporates the authorities’ approved supplementar

In [117]:
# Measured length of each of the first three retrieved source documents.
for position in range(3):
    print(tiktoken_len(res['source_documents'][position].page_content))

336
371
371


#### Now let's break it down using more customized process

In [167]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

- Here, you can do some filtering on your retrieved results 
- e.g. apply a similarity score threshold, rerank the results, etc. 

In [213]:
Q = "weo macro-economic forecast in 2022"
## Vector search restricted to chunks containing "China" / "china", top 3 matches only
q_embedding = vectordb._embedding_function.embed_query(Q)
res = vectordb._collection.query(
    query_embeddings=q_embedding,
    n_results=3,
    where_document={"$or": [{"$contains": spelling} for spelling in ("China", "china")]},
)
res_formated = _results_to_docs_and_scores(res)
# Keep only the Document of each (document, score) pair.
# NOTE: this rebinds `docs`, which previously held the documents loaded from disk.
docs = [retrieved for retrieved, _ in res_formated]
print(len(docs))
print(docs[0])

3
page_content='United States and Canada: Economic links between Russia and the United States and Canada are limited. Other factors also have a significant impact on the outlook for the two economies. The forecast for the United States was already downgraded in January, largely reflecting non-passage of the Build Back Better fiscal policy package and continued supply chain dis- ruptions. The additional 0.3 percentage point forecast markdown for 2022 in the current round reflects faster withdrawal of monetary support than in the previous projection—as policy tightens to rein in inflation—and the impact of lower growth in trading partners because of disruptions resulting from the war. The forecast for Canada is marked down 0.2 percentage point, reflect- ing the withdrawal of policy support and weaker exter- nal demand from the United States, which outweigh the lift from favorable terms of trade effects.\nLatin America and the Caribbean: With fewer direct connections to Europe, the region

- use loadQAwithSourceChain and pass in retrieved doc as context 

In [214]:
# QA-with-sources chain of type "stuff"; verbose=True prints the formatted prompt when it runs.
QA_chain = load_qa_with_sources_chain(llm, chain_type="stuff",verbose=True)


- we can see how the stuff chain works; see [code](https://github.com/hwchase17/langchain/blob/ff4a25b841f1cca6f4a973067d39dcc5ec2dcf81/langchain/chains/combine_documents/stuff.py#L18) 
- it will first put all context together

In [215]:
# Peek at how the chain assembles the retrieved documents into one context string.
# NOTE: _get_inputs is a private method of the chain and may change between library versions.
temp = QA_chain._get_inputs(docs)
print(temp['summaries'])

Content: United States and Canada: Economic links between Russia and the United States and Canada are limited. Other factors also have a significant impact on the outlook for the two economies. The forecast for the United States was already downgraded in January, largely reflecting non-passage of the Build Back Better fiscal policy package and continued supply chain dis- ruptions. The additional 0.3 percentage point forecast markdown for 2022 in the current round reflects faster withdrawal of monetary support than in the previous projection—as policy tightens to rein in inflation—and the impact of lower growth in trading partners because of disruptions resulting from the war. The forecast for Canada is marked down 0.2 percentage point, reflect- ing the withdrawal of policy support and weaker exter- nal demand from the United States, which outweigh the lift from favorable terms of trade effects.
Latin America and the Caribbean: With fewer direct connections to Europe, the region is also

- then check the prompt length. If it is longer than the model's context window (about 4k tokens), we need to truncate it; otherwise the call will error out

In [216]:
# Length of the fully formatted prompt for these documents and this question.
QA_chain.prompt_length(docs,question=Q)

Token indices sequence length is longer than the specified maximum sequence length for this model (2763 > 1024). Running this sequence through the model will result in indexing errors


2763

In [224]:
# Run the chain on the retrieved documents; return_only_outputs=False keeps the inputs
# in the returned dict next to the generated output.
answer = QA_chain({"input_documents": docs, "question": Q}, return_only_outputs=False)
print(answer)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following extracted parts of a long document and a question, create a final answer with references ("SOURCES"). 
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.

QUESTION: Which state/country's law governs the interpretation of the contract?
Content: This Agreement is governed by English law and the parties submit to the exclusive jurisdiction of the English courts in  relation to any dispute (contractual or non-contractual) concerning this Agreement save that either party may apply to any court for an  injunction or other relief to protect its Intellectual Property Rights.
Source: 28-pl
Content: No Waiver. Failure or delay in exercising any right or remedy under this Agreement shall not constitute a waiver of such (or any other)  right or remedy.

11.7 

In [225]:
# Show only the generated answer text.
print(answer['output_text'])

The WEO macro-economic forecast for 2022 is that global growth will be 3.6 percent, with inflation projected at 5.7 percent in advanced economies and 8.7 percent in emerging market and developing economies. The forecast for the United States was downgraded by 0.3 percentage points due to faster withdrawal of monetary support and lower growth in trading partners resulting from the war. The forecast for Canada was marked down by 0.2 percentage points due to the withdrawal of policy support and weaker external demand from the United States. The fluid international situation means that quantitative forecasts are even more uncertain than usual, and downside risks to the global outlook dominate. The war in Ukraine has increased the probability of wider social tensions because of higher food and energy prices, which would further weigh on the outlook. Inflation is expected to remain elevated for longer than in the previous forecast, driven by war-induced commodity price increases and broadeni

- we can try a different chain type - the refine chain
- it will make multiple calls to llm and iteratively refine the answer 

In [226]:
# Same documents and question, but with the "refine" chain type.
# NOTE: this rebinds QA_chain, replacing the "stuff" chain created earlier.
QA_chain = load_qa_with_sources_chain(llm, chain_type="refine",verbose=True)
answer = QA_chain({"input_documents": docs, "question": Q}, return_only_outputs=False)
print(answer)



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mContext information is below. 
---------------------
Content: United States and Canada: Economic links between Russia and the United States and Canada are limited. Other factors also have a significant impact on the outlook for the two economies. The forecast for the United States was already downgraded in January, largely reflecting non-passage of the Build Back Better fiscal policy package and continued supply chain dis- ruptions. The additional 0.3 percentage point forecast markdown for 2022 in the current round reflects faster withdrawal of monetary support than in the previous projection—as policy tightens to rein in inflation—and the impact of lower growth in trading partners because of disruptions resulting from the war. The forecast for Canada is marked down 0.2 percentage point, reflect- ing the withdrawal of policy support and weaker exter- n

- this does look a little better 

In [228]:
# Show only the generated (refined) answer text.
print(answer['output_text'])

Refined answer: The 2022 World Economic Outlook (WEO) macro-economic forecast predicts a significant worsening of global economic prospects since the last forecast in January, largely due to Russia's invasion of Ukraine. The war will severely set back the global recovery, slowing growth and increasing inflation even further. The forecast projects global growth at 3.6 percent in 2022 and 2023, which is 0.8 and 0.2 percentage points lower than in the January forecast, respectively. The downgrade largely reflects the war's direct impacts on Russia and Ukraine and global spillovers. Both Russia and Ukraine are projected to experience large GDP contractions in 2022. The severe collapse in Ukraine is a direct result of the invasion, destruction of infrastructure, and exodus of its people. In addition to the war, frequent and wider-ranging lockdowns in China, including in key manufacturing hubs, have also slowed activity there and could cause new bottlenecks in global supply chains. The forec

- try a different question 

In [229]:
# Rebuild the retrieval QA chain, this time with the "refine" chain type.
qa_with_sources = VectorDBQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="refine",# refine: makes multiple LLM calls and iteratively refines the answer
    vectorstore=vectordb,
    k=5,
    return_source_documents=True)

In [230]:
Q = "what is the assessment on banking sector risks?"
# Retrieve from the vector store and answer with the refine-based chain.
answer = qa_with_sources(Q)

In [233]:
# This chain stores its generated text under the 'answer' key.
print(answer['answer'])

The new context provides additional information on the risks faced by the banking sector in emerging markets, particularly related to their holdings of domestic sovereign debt. According to the 2022 Global Financial Stability Report and the IPCC Sixth Assessment Report, emerging market banks have significantly increased their holdings of domestic sovereign debt during the COVID-19 pandemic, which on average accounts for about one-fifth of banking sector assets and 200 percent of their regulatory capital. This deepens the ties between the sovereign and banking sectors, creating a sovereign-bank nexus that poses risks of an adverse feedback loop that could threaten macro-financial stability. The transmission of risks between the sovereign and banking sectors is significant, both directly and indirectly through the nonfinancial corporate sector. An increase in sovereign risk can adversely affect banks' balance sheets and lending appetite, especially in countries with less-well-capitalized

### More low level approach 
- [notebook from James](https://github.com/pinecone-io/examples/blob/master/generation/gpt4-retrieval-augmentation/gpt-4-langchain-docs.ipynb)
- follow gpt3.5-4 api format, define system and user role 