In [None]:
!pip install langchain_community
!pip install langchain
%pip install --upgrade --quiet unstructured
%pip install --upgrade --quiet chromadb
%pip install --upgrade --quiet  tiktoken
!pip install chromadb

In [None]:
!pip install sentence-transformers

In [None]:
!pip install openai

Great articles for reading:
- https://medium.com/@callumjmac/implementing-rag-in-langchain-with-chroma-a-step-by-step-guide-16fc21815339

# This project is to build a RAG pipeline to answer questions from html

- Framework: explore how to implement RAG in LangChain


Process:
1. Get raw data source
2. Extract information
3. Text Chunking
4. Embedding
5. Save in VectorDB
6. Query and get related doc
7. Prompt - Give the relevant context and query to an LLM

In [None]:
# Langchain dependencies
from langchain.document_loaders.pdf import PyPDFDirectoryLoader # Importing PDF loader from Langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter # Importing text splitter from Langchain
from langchain.embeddings import OpenAIEmbeddings # Importing OpenAI embeddings from Langchain
from langchain.schema import Document # Importing Document schema from Langchain
from langchain.vectorstores.chroma import Chroma # Importing Chroma vector store from Langchain
from langchain.chat_models import ChatOpenAI # Import OpenAI LLM
from langchain_community.document_loaders.url import UnstructuredURLLoader
from langchain_community.document_loaders.html import UnstructuredHTMLLoader
from langchain_text_splitters import CharacterTextSplitter
#from langchain_chroma import Chroma

from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
# for vectordb
import chromadb

import openai
import os

Here I'll try unstructured data - url first. Go through 1 html page from news and answer several questions

In [None]:
# reference: https://python.langchain.com/v0.2/docs/integrations/document_loaders/url/
# https://python.langchain.com/v0.2/docs/integrations/vectorstores/chroma/

In [None]:
urls = ['https://themarketswatch.com/business/gold-demand-to-hit-record-due-to-central-bank-buying-fed-rate-cuts/?gad_source=1&gclid=Cj0KCQjwtZK1BhDuARIsAAy2VzuF7rh3REt5IIHCY2QTCkjE98KgYlQ30GupFhRg1kD05mmNxhZ5PO4aAhFZEALw_wcB']
# this is for gold price
url_loader = UnstructuredURLLoader(urls=urls)
url_documents = url_loader.load()
url_documents[0]

Document(metadata={'source': 'https://themarketswatch.com/business/gold-demand-to-hit-record-due-to-central-bank-buying-fed-rate-cuts/?gad_source=1&gclid=Cj0KCQjwtZK1BhDuARIsAAy2VzuF7rh3REt5IIHCY2QTCkjE98KgYlQ30GupFhRg1kD05mmNxhZ5PO4aAhFZEALw_wcB'}, page_content="Home\n\nMarkets\n\nBusiness\n\nInvesting\n\nEconomy\n\nTech\n\nPersonal Finance\n\nPolitics\n\nReal Estate\n\nRetirement\n\nWorld\n\nClose\n\nToday: 28. Jul, 2024\n\nHome\n\nMarkets\n\nBusiness\n\nInvesting\n\nEconomy\n\nTech\n\nPersonal Finance\n\nPolitics\n\nReal Estate\n\nRetirement\n\nWorld\n\nBusiness·Investing\n\nRecord-High Gold Demand Has Just Begun, How Investors Can Still Get In Early\n\nJuly 25, 2024\n\nTravis Grant\n\n3 mins read\n\nGold prices have been rising for over twenty years, with total gold demand hitting a record last year. 2024 is forecast to be another record year. Interest rates have recently caused short-term upward pressure on gold. The long-term shift to gold has started with BRICS countries settlin

2. Text Chunking

- The most important thing to consider when deciding a chunking strategy is the structure of the documents that you are loading into your vector database. If the documents contain similar-length paragraphs, it would be useful to consider this when determining the size of the chunk.

Below is just test, However,
- When deploying these systems to real-world applications, it is important to plot distributions of text lengths in your data, and tune these parameters based on experimentation of parameters such as `chunk_size` and `chunk_overlap`.

In [None]:
# reference: https://medium.com/@bijit211987/chunking-strategies-for-fine-tuning-llms-30d2988c3b7a

In [None]:
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=500, # if chunk size too large, will get more noisy info
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=True, # Flag to add start index to each chunk
  )

# text_splitter = SentenceSplitter(
#  separator=" ",
#  chunk_size=1024,
#  chunk_overlap=20,
#  paragraph_separator="\n\n\n",
#  secondary_chunking_regex="[^,.;。]+[,.;。]?",
#  tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode
#)

# Split documents into smaller chunks using text splitter
chunks = text_splitter.split_documents(url_documents)
print(f"Split {len(url_documents)} documents into {len(chunks)} chunks.")
chunks[1].page_content
chunk_text = [chunk.page_content for chunk in chunks]


Split 1 documents into 24 chunks.


3. Embedding and Choose vectorDB
- Here I'll choose Chroma as vectorDB
- And for embedding, i will use some open-source modelling as they are for free


In [None]:
# reference: https://docs.trychroma.com/getting-started
# https://docs.trychroma.com/guides/embeddings

In [None]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
chunk_embeddings = embeddings.embed_documents(chunk_text)
chunk_embeddings

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[[0.07507748901844025,
  -0.06166299059987068,
  -0.06970856338739395,
  0.005123568698763847,
  0.07033701241016388,
  0.023548109456896782,
  0.06192732974886894,
  -0.02391689270734787,
  -0.05200165882706642,
  -0.037378497421741486,
  -0.11664356291294098,
  0.04423775151371956,
  -0.0052614156156778336,
  -0.006914015859365463,
  0.08127841353416443,
  0.0019922961946576834,
  -0.037674274295568466,
  -0.08634399622678757,
  0.005281098186969757,
  0.024466371163725853,
  -0.04025563970208168,
  -0.02523810602724552,
  -0.07134844362735748,
  0.011835300363600254,
  0.05827829986810684,
  0.02309352532029152,
  0.045978844165802,
  -0.0014070095494389534,
  -0.046777378767728806,
  0.01189106423407793,
  -0.04334048926830292,
  0.030531305819749832,
  -0.019528962671756744,
  0.017672812566161156,
  0.0014002423267811537,
  0.004604787100106478,
  -0.08609015494585037,
  0.04587728902697563,
  0.08662917464971542,
  -0.020509304478764534,
  0.02147270180284977,
  -0.1213069483637

- Store in Chroma

In [None]:
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(name='gold_news')

In [None]:
ids = [str(i) for i in range(len(chunk_embeddings))]
embeddings_with_ids = list(zip(ids, chunk_embeddings))

#embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

collection.add(
    documents=chunk_text,
    embeddings=chunk_embeddings,
    ids=ids
)


In [None]:
from pprint import pprint

query = "Will gold price decrease?"
query_embedding = embeddings.embed_query(query)

results = collection.query(query_embeddings=[query_embedding], n_results=5)
pprint(results)

{'data': None,
 'distances': [[0.6557489633560181,
                0.6561487913131714,
                0.6767497062683105,
                0.7488464713096619,
                0.7668339014053345]],
 'documents': [['Gold is widely regarded as a safe-haven asset, but long-term '
                'global trends are causing it to grow like a growth stock. '
                'Central banks are concerned about the perceived value of '
                'their currencies and the security of their trade, prompting '
                'them to intervene in the gold market and buy gold while it’s '
                'on sale. Their intervention and purchases have driven the '
                'price up over 20% in the past year, reaching an all-time high '
                'of $2450 per ounce in May.',
                'Gold prices have been rising for over twenty years, with '
                'total gold demand hitting a record last year. 2024 is '
                'forecast to be another record year. Inter

In [None]:
context = results['documents'][0]
context_text = ''

for x in context:
  context_text += ''.join(x)

context_text

'Gold is widely regarded as a safe-haven asset, but long-term global trends are causing it to grow like a growth stock. Central banks are concerned about the perceived value of their currencies and the security of their trade, prompting them to intervene in the gold market and buy gold while it’s on sale. Their intervention and purchases have driven the price up over 20% in the past year, reaching an all-time high of $2450 per ounce in May.Gold prices have been rising for over twenty years, with total gold demand hitting a record last year. 2024 is forecast to be another record year. Interest rates have recently caused short-term upward pressure on gold. The long-term shift to gold has started with BRICS countries settling on using gold as the basis for international exchange and central bank interest cuts. As gold prices soar, gold mining stocks are just starting to catch up, giving investors a viable opportunity to profit.According to a recent World Gold Council (WGC) survey involvin

1. Use QA pipeline

In [None]:
from transformers import pipeline

model = "bert-large-uncased-whole-word-masking-finetuned-squad"

qa_pipeline = pipeline("question-answering", model=model) # maybe we can customize a model in the future

query = "Will gold price decrease?"

qa_pipeline(context = context_text, question = query)['answer']


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


'no banks expect a decline in gold purchases'

In [None]:
!pip install --upgrade openai

2. Try Prompt

In [None]:
api_key = os.getenv("OPENAI_API_KEY")

In [None]:
from openai import OpenAI
client = OpenAI(api_key=api_key)

question = query


prompt = f"Based on the following context, please answer my question as a human:\n\n{context_text}\n\nQuestion: {question}"

response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ],
    max_tokens=150,
    n=1,
    stop=None,
    temperature=0.7,
)


In [46]:
pprint(response.choices[0].message.content)

('Based on the context provided, it seems unlikely that gold prices will '
 'decrease in the near term. The outlook is quite bullish for gold, with '
 'central banks continuing to purchase gold, expectations of rising demand, '
 'and significant forecasts from major financial institutions predicting '
 'further increases in gold prices. With central banks actively buying gold '
 'and no expectation of a decline in purchases, along with a historical trend '
 'of rising prices, the conditions appear favorable for continued growth '
 "rather than a decrease. Therefore, while it's impossible to predict market "
 'movements with certainty, the current trends suggest that gold prices are '
 'more likely to increase than decrease in the near future.')


# FOR MY FUTURE STEP - I WANNA USE OPENAI API TO MAKE THE ANSWER MORE ACCURATE AND HUMAN-LIKE

RESULT IS NOT WHAT I WANT - from transformers import RagTokenizer, RagSequenceForGeneration

In [None]:
#you can combine both retrieval-based and generative-based approaches effectively.