In [None]:
from google.colab import userdata
# Read the credentials this notebook needs from Colab's `userdata` store.
# hf_token is passed to the Hugging Face login below, api_key is handed to the
# Groq / ChatGroq clients, and pinecone_api_key is exported as PINECONE_API_KEY
# in the Pinecone section.
hf_token = userdata.get('HF_TOKEN')
api_key = userdata.get('GROQ_API_KEY')
pinecone_api_key = userdata.get('PINECONE_API_KEY')

In [None]:
!pip install datasets
!pip install chromadb
!pip install langchain_community
!pip install pypdf
# !pip install faiss-gpu
!pip install faiss-cpu
!pip install langchain_openai
!pip install langchain_groq
!pip install langchain_pinecone

In [None]:
from huggingface_hub import login
from datasets import load_dataset
import chromadb
from tqdm import tqdm

In [None]:
# `os` and `load_dotenv` are needed here but were not imported by any earlier
# cell, so import them locally to keep the notebook runnable on a fresh kernel.
import os

try:
    from dotenv import load_dotenv
except ImportError:
    # python-dotenv is optional: without it we rely on the process environment
    # and on the Colab secrets read at the top of the notebook.
    load_dotenv = None

if load_dotenv is not None:
    # Copies variables from a local .env file (if present) into os.environ.
    load_dotenv()

# Never export a placeholder string as a credential: any library that reads
# these variables would send it as if it were a real key. OPENAI_API_KEY is
# therefore left untouched when it is absent, and HF_TOKEN falls back to the
# Colab secret loaded earlier instead of a dummy value.
if not os.getenv('HF_TOKEN') and hf_token:
    os.environ['HF_TOKEN'] = hf_token

In [None]:
# Log in to the Hugging Face Hub with the token read from Colab secrets.
# add_to_git_credential=True additionally asks the hub client to store the
# token for git operations.
login(hf_token, add_to_git_credential=True)

In [None]:
# Hub coordinates of the dataset, in "<owner>/<dataset>" form.
HF_USER = "ed-donner"
DATASET_NAME = "/".join([HF_USER, "pricer-data"])

# Load the dataset and keep a handle on each split.
dataset = load_dataset(DATASET_NAME)
train, test = dataset['train'], dataset['test']

In [None]:
# Number of training rows to index in the vector stores below.
N_DATA = 1000

# Collect the 'text' field of the first N_DATA training rows.
data = []
for row_index in range(N_DATA):
    data.append(train[row_index]['text'])

**Chroma DB**

In [None]:
# Directory passed to Chroma's PersistentClient as its storage path.
DB = "products_vectorstore"
client = chromadb.PersistentClient(path=DB)

In [None]:
## Creating Collection

collection_name = "products"

# list_collections() yields Collection objects in some chromadb releases and
# plain name strings in others. Normalise to names so the membership test
# below can actually match (a Collection object never equals a str).
existing_collection_names = [
    getattr(existing, "name", existing) for existing in client.list_collections()
]

# Start from an empty collection on every run: drop any previous copy first,
# because create_collection() refuses a name that already exists.
if collection_name in existing_collection_names:
    client.delete_collection(collection_name)
    print(f"Deleted existing collection: {collection_name}")

collection = client.create_collection(collection_name)
print(f"Existing collections: {client.list_collections()}")

**Embedding using SentenceTransformer**

https://huggingface.co/sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Embedding model used to vectorise the product texts before they are stored.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Disabled smoke test: encode a single sentence and look at its vector.
# vector = model.encode(["Well hi there"])[0]

In [None]:
# Embed the texts and add them to the Chroma collection in batches.
batch_size = 100
for i in tqdm(range(0, len(data), batch_size)):
    documents = data[i: i + batch_size]
    # encode() output is converted to plain Python floats before being stored.
    vectors = model.encode(documents).astype(float).tolist()
    # One metadata dict per document, recording its character length.
    metadatas = [{"len": len(item)} for item in documents]
    # Size the id list from the actual batch: the last slice is shorter than
    # batch_size whenever len(data) is not a multiple of it, and the ids,
    # documents, embeddings and metadatas passed to add() must line up 1:1.
    ids = [f"doc_{j}" for j in range(i, i + len(documents))]
    collection.add(
        ids=ids,
        documents=documents,
        embeddings=vectors,
        metadatas=metadatas,
    )

**Read data from Chroma DB**

In [None]:
# Re-open the persisted store and the "products" collection.
DB = "products_vectorstore"
client = chromadb.PersistentClient(path=DB)
print(f"Existing collections: {client.list_collections()}")

collection_name = "products"
collection = client.get_collection(collection_name)
print(f"Collection count: {collection.count()}")

# Fetch a capped number of stored records, embeddings included.
MAXIMUM_DATAPOINTS = 500
result = collection.get(
    limit=MAXIMUM_DATAPOINTS,
    include=['embeddings', 'documents', 'metadatas'],
)

# Print the id, text and metadata of one stored record.
n = 27
for field in ('ids', 'documents', 'metadatas'):
    print(result[field][n])
# print(result['embeddings'][n])

In [None]:
# from sentence_transformers import SentenceTransformer
# Same model name as the one used when the collection was populated, so the
# query vectors produced below are comparable with the stored embeddings.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def find_similars(text, n_results=5):
  """Look up the stored documents that best match `text`.

  The text is encoded with the module-level `model`, and the module-level
  `collection` is queried with that embedding for `n_results` entries.

  Returns a tuple `(documents, full_results)`: the documents returned for
  this single query, followed by the unmodified query response.
  """
  query_embedding = model.encode([text]).astype(float).tolist()
  response = collection.query(n_results=n_results, query_embeddings=query_embedding)
  return response['documents'][0], response

# Example query: retrieve the stored product descriptions matching this text.
text = "Quadcast HyperX condenser mic for high quality podcasting"
documents, full_results = find_similars(text)
# Bare last expression so the notebook displays the retrieved documents.
documents

In [None]:
## Guess price with DeepSeek R1
!pip install groq
import groq
from groq import Groq
# NOTE(review): this rebinds `client`, which earlier cells use for the Chroma
# PersistentClient; from this point `client` is the Groq API client, built
# with the GROQ_API_KEY secret loaded at the top of the notebook.
client = Groq(api_key=api_key)

In [None]:
text = "Quadcast HyperX condenser mic for high quality podcasting"
rag_documents = documents

# Prompt layout: the item description first, then the retrieved documents
# (separated by blank lines) as context.
prompt_template = """ Guess the price of this item based on the context below
{text}

Context:
{rag_documents}

Answer:
"""
prompt = prompt_template.format(text=text, rag_documents='\n\n'.join(documents))

# Send the prompt as a single user message.
user_message = {"role": "user", "content": prompt}
chat_completion = client.chat.completions.create(
    model="deepseek-r1-distill-llama-70b",
    messages=[user_message],
)

# Keep only the text of the first returned choice.
first_choice = chat_completion.choices[0]
result = first_choice.message.content

In [None]:
from IPython.display import Markdown, display

# Render the model's reply as Markdown rather than as a plain string.
display(Markdown(result))

**FAISS DB - LOCAL**

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
# from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain_groq import ChatGroq
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain import hub


#########
### WRITE
#########
# The PDF is referenced by URL and handed straight to the loader.
pdf_path = "https://rauterberg.employee.id.tue.nl/lecturenotes/DDM110%20CAS/Orwell-1949%201984.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Split the loaded documents on newlines, with chunk_size=1000 and a
# 30-character overlap between neighbouring chunks.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=30)
docs = text_splitter.split_documents(documents)

# embedding = OpenAIEmbeddings()
embedding = SentenceTransformerEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Build the FAISS store from the chunks, then save it to a local folder.
vectorstore = FAISS.from_documents(docs, embedding)
vectorstore.save_local("faiss_index_react")

In [None]:
#########
### READ
#########
# Same embedding model name as in the WRITE cell, so queries are embedded the
# same way as the indexed chunks.
embedding = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# SECURITY NOTE(review): allow_dangerous_deserialization=True opts in to
# deserializing the saved index from disk. That is acceptable here only
# because "faiss_index_react" was written by this notebook; do not point this
# at an index obtained from an untrusted source.
new_vectorstore = FAISS.load_local(
    "faiss_index_react", embedding, allow_dangerous_deserialization=True
)

# Retrieval-QA chat prompt pulled from the LangChain hub.
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
# Groq-hosted chat model, temperature 0, using the GROQ_API_KEY secret.
chat = ChatGroq(temperature=0, groq_api_key=api_key, model_name="deepseek-r1-distill-llama-70b")
# Chain that combines the retrieved documents with the prompt for the model.
combine_docs_chain = create_stuff_documents_chain(
    chat, retrieval_qa_chat_prompt
)
# Full pipeline: the FAISS retriever feeding the document-combining chain.
retrieval_chain = create_retrieval_chain(
    new_vectorstore.as_retriever(), combine_docs_chain
)

# The chain takes the question under the "input" key.
res = retrieval_chain.invoke({"input": "Who does O'Brien represent as a historical figure? In the context of the 20th century? what about the 21st century?"})

# Render the generated answer (the 'answer' key of the result) as Markdown.
from IPython.display import Markdown, display
display(Markdown(res['answer']))

**Pinecone**

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
# from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain_groq import ChatGroq
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain import hub


#########
### WRITE
#########
import os
# Export the Pinecone key loaded from Colab secrets; presumably the Pinecone
# integration reads it from this environment variable, since it is not passed
# explicitly anywhere below.
os.environ["PINECONE_API_KEY"] = pinecone_api_key

# Same source PDF and chunking settings as the FAISS section.
pdf_path = "https://rauterberg.employee.id.tue.nl/lecturenotes/DDM110%20CAS/Orwell-1949%201984.pdf"
loader = PyPDFLoader(file_path=pdf_path)
documents = loader.load()
text_splitter = CharacterTextSplitter(
    chunk_size=1000, chunk_overlap=30, separator="\n"
)
docs = text_splitter.split_documents(documents=documents)

# embedding = OpenAIEmbeddings()
embedding = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# NOTE(review): nothing in this notebook creates the "orwell-1984" index, so it
# is assumed to exist already with a dimension matching this embedding model.
# TODO confirm. Re-running this cell uploads the chunks again; verify whether
# that duplicates vectors in the index.
vector_store = PineconeVectorStore.from_documents(docs, embedding, index_name="orwell-1984")
# vector_store.add_documents(docs)

In [None]:
# Inspection only: show the embedding object's repr. This cell assigns nothing.
embedding

In [None]:
#########
### READ
#########
# Same embedding model name as in the WRITE cell, so queries are embedded the
# same way as the uploaded chunks.
embedding = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# Attach to the existing "orwell-1984" index by name; nothing is uploaded here.
new_vectorstore = PineconeVectorStore(embedding=embedding, index_name="orwell-1984")
# Disabled check: query the retriever directly, without the LLM.
# new_vectorstore.as_retriever().get_relevant_documents("Who does O'Brien represent as a historical figure? In the context of the 20th century? what about the 21st century?")
# Retrieval-QA chat prompt pulled from the LangChain hub.
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
# Groq-hosted chat model, temperature 0, using the GROQ_API_KEY secret.
llm = ChatGroq(temperature=0, groq_api_key=api_key, model_name="deepseek-r1-distill-llama-70b")

In [None]:
# Chain that combines the retrieved documents with the QA prompt for the LLM.
combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)

# Retrieval pipeline: the Pinecone retriever feeding the document chain.
pinecone_retriever = new_vectorstore.as_retriever()
chain = create_retrieval_chain(pinecone_retriever, combine_docs_chain)

# The chain takes the question under the "input" key.
obrien_question = "Who does O'Brien represent as a historical figure? In the context of the 20th century? what about the 21st century?"
res = chain.invoke({"input": obrien_question})

# Render the 'answer' entry of the result as Markdown.
from IPython.display import Markdown, display
answer_markdown = Markdown(res['answer'])
display(answer_markdown)

**Web Search**

In [None]:
from google.colab import userdata
tavily_token = userdata.get('TAVILY_API_KEY')

import os
# The key is exported rather than passed to the tool; presumably the Tavily
# integration picks it up from this environment variable.
os.environ["TAVILY_API_KEY"] = tavily_token

from langchain_community.tools.tavily_search import TavilySearchResults
question = "Latest political news"
# `max_results` is the tool's result-count setting. The previous `k=3` is not
# a field of TavilySearchResults, so depending on the installed version it was
# either rejected or silently ignored, and the cap of three never applied.
web_search_tool = TavilySearchResults(max_results=3)
docs = web_search_tool.invoke({"query": question})

# !pip install tavily-python
# from tavily import TavilyClient
# question = "Latest political news"
# client = TavilyClient(api_key=tavily_token)
# response = client.search(question)
# print(response['results'][0].keys())
# print(response['results'][0])


In [48]:
# !pip install langchain-community duckduckgo-search
from langchain_community.tools import DuckDuckGoSearchResults
from langchain.schema import Document
import json
# Ask the DuckDuckGo tool for results in its "json" output format.
search = DuckDuckGoSearchResults(output_format="json")
question = "Latest political news in the UK"
result = search.invoke(question)

# Turn every hit into "key: value" lines, with a blank line between hits.
hit_blocks = []
for hit in json.loads(result):
    field_lines = [f"{key}: {value}" for key, value in hit.items()]
    hit_blocks.append('\n'.join(field_lines))
result_string = '\n\n'.join(hit_blocks)
result_string

'snippet: News UK UK Politics UK politics live: Reeves urges Europe to \'step up\' defence spending as PM vows 2.5% commitment UK will increase defence spending to 2.5 per cent by 2027\ntitle: UK politics live: Reeves urges Europe to \'step up\' defence spending as ...\nlink: https://www.independent.co.uk/news/uk/politics/starmer-defence-foreign-aid-ukraine-reeves-trump-latest-news-b2704688.html\n\nsnippet: It\'s not productive," Hegseth said on "Fox News Sunday." Delegations from the United States, left, and Russia, right, meet in Riyadh, Saudi Arabia, on February 18.\ntitle: French, UK leaders rush to Washington at a historic turning point ... - CNN\nlink: https://www.cnn.com/2025/02/24/politics/trump-macron-starmer-ukraine-analysis/index.html\n\nsnippet: Reuters.com is your online source for the latest UK news stories and current events, ensuring our readers up to date with any breaking news developments\ntitle: UK News | Top Stories from the UK | Reuters\nlink: https://www.reuters.