In [15]:
from langchain_community.document_loaders import ArxivLoader

# Fetch the paper through ArxivLoader, passing the arXiv identifier as the query.
loader = ArxivLoader(query="2405.17147")
docs = loader.load()

# Fail with a clear message when the lookup returns nothing, instead of an
# opaque IndexError on docs[0] below. (The previous bare `len(docs)` was not
# the last expression of the cell, so it was neither displayed nor checked.)
assert len(docs) > 0, "ArxivLoader returned no documents for query 2405.17147"

# Display which metadata fields were attached to the loaded paper.
docs[0].metadata.keys()


dict_keys(['Published', 'Title', 'Authors', 'Summary'])

In [35]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents with chunk_size=1000 and chunk_overlap=100,
# then display the resulting chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunk_data = splitter.split_documents(docs)
chunk_data

[Document(page_content='CTSOC NEWS ON CONSUMER TECHNOLOGY\n1\nLarge Language Models (LLMs):\nDeployment, Tokenomics and Sustainability\nHaiwei Dong Senior Member, IEEE, Shuang Xie Member, IEEE\nAbstract—The rapid advancement of Large Language Models\n(LLMs) has significantly impacted human-computer interaction,\nepitomized by the release of GPT-4o, which introduced com-\nprehensive multi-modality capabilities. In this paper, we first\nexplored the deployment strategies, economic considerations,\nand sustainability challenges associated with the state-of-the-art\nLLMs. More specifically, we discussed the deployment debate\nbetween Retrieval-Augmented Generation (RAG) and fine-tuning,\nhighlighting their respective advantages and limitations. After\nthat, we quantitatively analyzed the requirement of xPUs in\ntraining and inference. Additionally, for the tokenomics of LLM\nservices, we examined the balance between performance and cost\nfrom the quality of experience (QoE)’s perspective o

In [39]:
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
import os
from dotenv import load_dotenv
# Load environment variables via python-dotenv before reading any keys.
load_dotenv()

# Name of the Pinecone index used throughout the notebook.
index_name = "rag-pc-demo"

# The API key is read from the environment so it never appears in the notebook.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")

# Build the Pinecone client and display it.
pc = Pinecone(api_key=pinecone_api_key)
pc

<pinecone.control.pinecone.Pinecone at 0x297ccaa7ce0>

In [40]:
import time

# Names of the indexes that already exist for this client.
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# Create the index only when it is missing, so re-running the cell reuses it.
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        # Must equal the length of the embedding vectors stored in the index.
        # NOTE(review): 1536 is assumed to match the embedding model configured
        # in this notebook (text-embedding-ada-002) — confirm if the model changes.
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

    # Poll once per second until the new index reports itself as ready.
    # The readiness flag in the index status is keyed "ready"; the previous
    # key "pinecone index is ready" does not exist and raised a KeyError.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

# Handle to the (newly created or pre-existing) index.
index = pc.Index(index_name)

In [41]:
from langchain_openai import OpenAIEmbeddings
# Read the OpenAI key from the environment, build the embedding client with
# it, and display the client's configuration as the cell output.
openai_api_key = os.environ.get("OPENAI_API_KEY")
embeddings = OpenAIEmbeddings(api_key=openai_api_key)
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x00000297CCECBAA0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x00000297CCECB5C0>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [42]:
# Build the vector store from the chunks, the embedding client and the
# Pinecone index name defined in the earlier cells.
vector_store = PineconeVectorStore.from_documents(
    documents=chunk_data,
    embedding=embeddings,
    index_name=index_name,
)

In [144]:
# Create a retriever over the vector store that is configured with k=3.
query = "describe the carbon footprint by gpt 4o."

retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# Query through `invoke`, the same entry point the later retrieval cell uses,
# instead of the deprecated `get_relevant_documents`.
retriever.invoke(query)

[Document(page_content='ables researchers and developers to make informed decisions\nregarding model design and training procedures, promoting\nCTSOC NEWS ON CONSUMER TECHNOLOGY\n5\nthe development of more sustainable LLMs. Tools like mlco2\n[20] offer a preliminary assessment based on GPU usage,\nbut they often have limitations, such as being unable to\naccount for dense or mixture-of-experts (MoE) architectures.\nLLMCarbon [21], a recently introduced end-to-end carbon\nfootprint projection model, addresses these shortcomings by\nproviding more comprehensive and nuanced estimations for\nvarious LLM architectures, including dense and MoE models.\nFor instance, LLMCarbon estimates that training a GPT-3\nmodel could generate around 553.87 tCO2eq (tonnes of CO2\nequivalent), compared to actual data, the disparity is only\n+0.32% with the actual emit is 536.69 tCO2eq. However,\nthe training operational carbon footprint estimation made by\nmlco2 is 69% higher than the actual, because mlco2 

In [45]:
# Fetch the top 3 matches together with their relevance scores. This goes
# through the public VectorStore method rather than the underscore-prefixed
# internal helper, which is not part of the supported API.
doc_and_score = vector_store.similarity_search_with_relevance_scores(query, k=3)
doc_and_score

[(Document(page_content='ables researchers and developers to make informed decisions\nregarding model design and training procedures, promoting\nCTSOC NEWS ON CONSUMER TECHNOLOGY\n5\nthe development of more sustainable LLMs. Tools like mlco2\n[20] offer a preliminary assessment based on GPU usage,\nbut they often have limitations, such as being unable to\naccount for dense or mixture-of-experts (MoE) architectures.\nLLMCarbon [21], a recently introduced end-to-end carbon\nfootprint projection model, addresses these shortcomings by\nproviding more comprehensive and nuanced estimations for\nvarious LLM architectures, including dense and MoE models.\nFor instance, LLMCarbon estimates that training a GPT-3\nmodel could generate around 553.87 tCO2eq (tonnes of CO2\nequivalent), compared to actual data, the disparity is only\n+0.32% with the actual emit is 536.69 tCO2eq. However,\nthe training operational carbon footprint estimation made by\nmlco2 is 69% higher than the actual, because mlco2

In [55]:
#format
def format_docs(docs):
    """Merge documents into a single block of text.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in input order, separated by a blank
        line; an empty string when ``docs`` is empty.
    """
    chunks = [item.page_content for item in docs]
    separator = "\n\n"
    return separator.join(chunks)

In [137]:
# Run the question through the retriever and print the joined chunk text,
# i.e. the string that format_docs produces for the retrieved documents.
query = "describe the carbon footprint by gpt 4o."
retrieved_docs = retriever.invoke(query)
context_text = format_docs(retrieved_docs)
print(context_text)

ables researchers and developers to make informed decisions
regarding model design and training procedures, promoting
CTSOC NEWS ON CONSUMER TECHNOLOGY
5
the development of more sustainable LLMs. Tools like mlco2
[20] offer a preliminary assessment based on GPU usage,
but they often have limitations, such as being unable to
account for dense or mixture-of-experts (MoE) architectures.
LLMCarbon [21], a recently introduced end-to-end carbon
footprint projection model, addresses these shortcomings by
providing more comprehensive and nuanced estimations for
various LLM architectures, including dense and MoE models.
For instance, LLMCarbon estimates that training a GPT-3
model could generate around 553.87 tCO2eq (tonnes of CO2
equivalent), compared to actual data, the disparity is only
+0.32% with the actual emit is 536.69 tCO2eq. However,
the training operational carbon footprint estimation made by
mlco2 is 69% higher than the actual, because mlco2 assumes

in multiple clouds. Between edge

In [138]:
from langchain_openai import ChatOpenAI
# Chat model client authenticated with the OpenAI key read earlier. No model
# name or sampling options are passed here, so the library defaults apply.
llm = ChatOpenAI(api_key=openai_api_key)

In [250]:
# Create a prompt by pulling "rlm/rag-prompt" from the LangChain hub.
# NOTE(review): `prompt` is reassigned to a plain formatted string in a later
# cell, and the RAG chain is built from `custom_rag_prompt`, so this pulled
# prompt appears to be unused — confirm whether this cell is still needed.

from langchain import hub
prompt = hub.pull("rlm/rag-prompt")

In [255]:
# Worked example: fill a hand-written prompt with one fixed question/context pair.
question = "What are the primary advantages of using Retrieval-Augmented Generation (RAG)?"
context = "RAG combines retrieval mechanisms with generative models to provide accurate and relevant responses by fetching information from external sources and generating coherent answers."

# The template is written one literal per output line so that the trailing
# spaces after the first sentence and after the two placeholders are visible.
prompt_template = (
    "You are an expert LLM assistant specialized in answering questions related to large language models (LLMs). "
    "Use the provided information and your knowledge to respond accurately and clearly to each question. \n"
    "\n"
    "Guidelines:\n"
    "1. Provide concise and informative answers.\n"
    "2. If the question is beyond the scope of your knowledge or the provided information, state, \"I don't know.\"\n"
    "3. Use examples where applicable to illustrate your answers.\n"
    "4. Maintain a professional and helpful tone.\n"
    "\n"
    "Question: {question} \n"
    "Context: {context} \n"
    "Answer:\n"
)

# Alternative, shorter wording kept for reference (not used):
# You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
# Question: {question}
# Context: {context}
# Answer:

# Substitute both placeholders to obtain the final prompt text.
prompt = prompt_template.format(context=context, question=question)


In [265]:
#Creating a customised prompt

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Prompt text for the RAG chain, written one literal per output line. The
# {context} and {question} placeholders are filled in by the chain.
template = (
    "You are an expert LLM assistant specialized in answering questions related to large language models (LLMs). "
    "Use the provided information and your knowledge to respond accurately and clearly to each question. \n"
    "\n"
    "Guidelines:\n"
    "1. Provide concise and informative answers.\n"
    "2. If the question is beyond the scope of your knowledge or the provided information, state, \"I don't know.\"\n"
    "3. Use examples where applicable to illustrate your answers.\n"
    "4. Maintain a professional and helpful tone.\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "\n"
    "Helpful Answer:\n"
)

# Wrap the raw text in a PromptTemplate so it can be used as a chain step.
custom_rag_prompt = PromptTemplate.from_template(template)

# Pipeline: build the prompt inputs from the incoming question, fill the
# custom prompt, call the chat model, and parse the reply with StrOutputParser.
rag_chain = (
    dict(
        # Retriever output piped through format_docs.
        context=retriever | format_docs,
        # The incoming question, handed on via RunnablePassthrough.
        question=RunnablePassthrough(),
    )
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

In [266]:
# Smoke-test the chain with a single question; the returned answer is the
# cell output.
query = (
    "what potential applications could arise "
    "from gpt-4o expanded functionalities?"
)
rag_chain.invoke(query)

" The expanded functionalities of GPT-4o, such as multi-modality capabilities, could lead to a wide range of potential applications. Some potential applications could include: \n1. Improved human-computer interaction: With the ability to generate images, videos, and audio based on text descriptions, GPT-4o could greatly enhance the way humans interact with computers, making it more natural and intuitive.\n2. Content creation and storytelling: The text-to-video and text-to-audio functionalities could be utilized for creating content such as videos and audiobooks from text descriptions, making it easier for content creators to produce high-quality and engaging content.\n3. Image and video summarization: The image-to-text and video-to-text functionalities could be used to automatically summarize images and videos, making it easier for users to quickly understand and process visual information.\n4. Virtual assistants and chatbots: GPT-4o's expanded functionalities could greatly improve the

In [269]:
import gradio as gr

def chat_response(message, memory):
    """Answer one chat message by running it through the RAG chain.

    Args:
        message: Text passed on to ``rag_chain.invoke``.
        memory: Second positional argument supplied by the chat UI; it is
            accepted but not used.

    Returns:
        Whatever ``rag_chain.invoke(message)`` returns.
    """
    answer = rag_chain.invoke(message)
    return answer

# Button components with plain-text labels for the chat controls.
retry_button = gr.Button("Retry")
undo_button = gr.Button("Undo")
clear_button = gr.Button("Clear")

# Sample questions offered by the interface.
example_questions = [
    "How does the introduction of GPT-4o by OpenAI, with its multimodal capabilities, represent a pivotal advancement in the evolution of large language models, and what potential applications could arise from these expanded functionalities?",
    "What are the primary advantages and challenges of deploying large language models using Retrieval-Augmented Generation (RAG) compared to fine-tuning, particularly in terms of handling specialized knowledge and reducing hallucinations?",
]

# Chat UI wired to chat_response, using the button instances defined above.
rag_demo = gr.ChatInterface(
    chat_response,
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(
        placeholder="Enter query here",
        container=False,
        scale=7,
    ),
    title="RAG Demo",
    examples=example_questions,
    cache_examples=False,
    retry_btn=retry_button,
    undo_btn=undo_button,
    clear_btn=clear_button,
    submit_btn="Submit",
)


In [270]:
# Start the Gradio app for the chat interface. The recorded output shows it
# serving on a local URL; passing share=True to launch() would create a
# public link.
rag_demo.launch()

Running on local URL:  http://127.0.0.1:7895

To create a public link, set `share=True` in `launch()`.


