In [1]:
from langchain_community.document_loaders import ArxivLoader

# Fetch the arXiv entry matching the query "2405.17147" as LangChain documents.
# `docs` is consumed by the chunking cell that follows.
loader=ArxivLoader(query="2405.17147")
docs = loader.load()
# Sanity check: number of documents returned by the loader.
len(docs)

1

In [2]:
#Chunk data
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping chunks for embedding.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in
# characters (the splitter's default length function) — confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap =100)
chunk_docs = text_splitter.split_documents(docs)
# Sanity check: number of chunks produced.
len(chunk_docs)


32

In [3]:
#Create embeddings
from langchain_openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv
# Load environment variables (presumably from a local .env file) so the API
# keys never have to be hard-coded in the notebook.
load_dotenv()

# os.getenv returns None when a variable is unset; no check is done here.
openai_api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")

# Embedding client shared by the vector-store upsert and the retriever.
embeddings = OpenAIEmbeddings(api_key=openai_api_key)

In [4]:
from pinecone import Pinecone, ServerlessSpec

import time

# Pinecone client and the name of the index that backs this notebook's RAG store.
pc = Pinecone(api_key=pinecone_api_key)
index_name = "pc-rag"

# Create the serverless index only when it does not exist yet, so the cell can
# be re-run without failing on an "already exists" error.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the length of the embedding vectors.
        # NOTE(review): 1536 assumes the default OpenAIEmbeddings model — confirm
        # if the embedding model is ever changed.
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Index creation is asynchronous: block until the index reports ready so
    # the upsert in the next cell does not hit an index that is still starting.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

  from tqdm.autonotebook import tqdm


In [5]:
#Upsert data
from langchain_pinecone import PineconeVectorStore

# Embed every chunk with `embeddings` and store it in the Pinecone index named
# by `index_name`; the returned store is used to build the retriever below.
# NOTE(review): re-running this cell presumably inserts the same chunks again — confirm.
vector_store = PineconeVectorStore.from_documents(chunk_docs, embeddings, index_name = index_name)

In [6]:
# Sample question reused by the later prompt and chain cells.
query = "What metrics are used to evaluate the quality of experience (QoE) for users of large language model (LLM) services?"

# Retriever over the Pinecone store; k=3 limits each lookup to the three
# closest chunks.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})
# `invoke` replaces the deprecated `get_relevant_documents`, which emitted a
# deprecation warning when this cell was run.
retriever.invoke(query)

  warn_deprecated(


[]

In [7]:
def format_docs(query):
    """Join the text of the given documents into a single context string.

    Args:
        query: Iterable of document objects exposing a ``page_content``
            attribute. The parameter name is kept for backward compatibility;
            the callers in this notebook pass retrieved documents, not a
            query string.

    Returns:
        The ``page_content`` of each document, separated by blank lines
        (an empty string when no documents are given).
    """
    # Iterate the argument rather than the notebook-global `docs` (the whole
    # paper), so the context contains only what the retriever returned.
    return "\n\n".join(doc.page_content for doc in query)

# Run the retriever once for the sample query; the result feeds the prompt preview cell.
retrieved_docs = retriever.invoke(query)


In [8]:
# Prompt for the basic RAG chain. `{context}` and `{question}` are filled in at
# call time. The string opens with exactly three quotes: a fourth quote would
# become a stray `"` at the very start of the prompt sent to the model.
template = """You are an expert LLM assistant specialized in answering questions related to large language models (LLMs). Use the provided information and your knowledge to respond accurately and clearly to each question. 

Guidelines:
1. Provide concise and informative answers.
2. If the question is beyond the scope of your knowledge or the provided information, state, "I don't know."
3. Use examples where applicable to illustrate your answers.
4. Maintain a professional and helpful tone.

Context: {context}
Question: {question}

Answer:

"""

In [9]:
# Preview the fully rendered prompt for the sample query before wiring the chain.
context = format_docs(retrieved_docs)
question = query

prompt = template.format(question=question, context=context)
print(prompt)

"You are an expert LLM assistant specialized in answering questions related to large language models (LLMs). Use the provided information and your knowledge to respond accurately and clearly to each question. 

Guidelines:
1. Provide concise and informative answers.
2. If the question is beyond the scope of your knowledge or the provided information, state, "I don't know."
3. Use examples where applicable to illustrate your answers.
4. Maintain a professional and helpful tone.

Context: CTSOC NEWS ON CONSUMER TECHNOLOGY
1
Large Language Models (LLMs):
Deployment, Tokenomics and Sustainability
Haiwei Dong Senior Member, IEEE, Shuang Xie Member, IEEE
Abstract—The rapid advancement of Large Language Models
(LLMs) has significantly impacted human-computer interaction,
epitomized by the release of GPT-4o, which introduced com-
prehensive multi-modality capabilities. In this paper, we first
explored the deployment strategies, economic considerations,
and sustainability challenges associated 

In [10]:
#rag chain
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI


# Prompt object built from the plain-text template, and the chat model that answers it.
custom_rag_template = PromptTemplate.from_template(template)
llm = ChatOpenAI(api_key=openai_api_key)

# Single-turn RAG pipeline: the incoming question is sent to the retriever
# (whose documents are flattened by `format_docs`) and is also passed through
# unchanged; both values fill the prompt, which the model answers as plain text.
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | custom_rag_template
    | llm
    | StrOutputParser()
)


In [None]:
# Run the single-turn RAG chain on the sample query (no chat history involved).
rag_chain.invoke(query)

In [122]:
#Create contextualised prompt
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder

# System instruction for the question-rewriting step: turn a follow-up that
# leans on chat history into a standalone question, without answering it.
contexttualised_system_prompt = "".join(
    [
        "Given a chat history and the latest user question ",
        "which might reference context in the chat history, ",
        "formulate a standalone question which can be understood ",
        "without the chat history. Do NOT answer the question, ",
        "just reformulate it if needed and otherwise return it as is.",
    ]
)

# Chat prompt for question rewriting: system instruction, then the prior turns
# (injected under "chat_history"), then the latest user message ("{input}").
contextualised_template=ChatPromptTemplate.from_messages(
    [
        ("system", contexttualised_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}")
    ]
)

In [121]:
#create history aware retriever
from langchain.chains import create_history_aware_retriever

# Combine the LLM, the base retriever and the rewriting prompt; intended to
# have the question reformulated against chat history before retrieval.
history_aware_retriever = create_history_aware_retriever(llm, retriever, contextualised_template)

In [124]:
#create system prompt

# System message for the conversational answer step; `{context}` receives the
# retrieved documents. The string opens with exactly three quotes: a fourth
# quote would become a stray `"` at the start of the system message.
system_prompt_template = """You are an expert LLM assistant specialized in answering questions related to large language models (LLMs). Use the provided information and your knowledge to respond accurately and clearly to each question. 

Guidelines:
1. Provide concise and informative answers.
2. If the question is beyond the scope of your knowledge or the provided information, state, "I don't know."
3. Use examples where applicable to illustrate your answers.
4. Maintain a professional and helpful tone.

Context: {context}

"""

# Chat prompt for answering: system message (with "{context}"), then the prior
# turns under "chat_history", then the latest user message ("{input}").
system_template=ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt_template),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}")
    ]
)


In [126]:
#create qa chain and rag chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

# Answering step: the LLM driven by `system_template`.
# NOTE(review): presumably the retrieved documents are placed into the
# prompt's "{context}" variable — confirm against the LangChain docs.
question_answer_chain = create_stuff_documents_chain(llm, system_template)

# Full conversational pipeline: history-aware retrieval followed by answering.
# This rebinds `rag_chain`, replacing the single-turn chain defined earlier.
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [128]:
#Manage chat_history
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# In-memory registry of chat histories, keyed by session id.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    Args:
        session_id: Key identifying the conversation in ``store``.

    Returns:
        The history object registered under ``session_id``; a fresh
        ``ChatMessageHistory`` is stored and returned when none exists yet.
    """
    try:
        return store[session_id]
    except KeyError:
        # First request for this session: register an empty history.
        history = store[session_id] = ChatMessageHistory()
        return history

# Wrap the retrieval chain with per-session message history looked up through
# `get_session_history`. The key names line up with the rest of the notebook:
# "input" and "chat_history" are the variables used by the chat prompts, and
# "answer" is the result key read by the helper functions below.
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key= "input",
    history_messages_key="chat_history",
    output_messages_key="answer"
)

In [130]:
import pprint


def conversational_chain(query, session_id="my_session"):
    """Ask the conversational RAG chain a question and pretty-print the result.

    Defined here so the calls in the following cells work on a fresh kernel
    run from top to bottom. The earlier draft of this cell was commented out,
    misspelled the function name, and passed a set literal instead of a dict
    as the "configurable" value.

    Args:
        query: The user's question.
        session_id: Key of the chat history to read and extend.

    Returns:
        The full result dict returned by ``conversational_rag_chain.invoke``.
    """
    answer = conversational_rag_chain.invoke(
        {"input": query},
        config={"configurable": {"session_id": session_id}},
    )
    pprint.pprint(answer)
    return answer

In [131]:
# Opening question of the conversation.
conversational_chain("What is QoE?")

{'answer': 'QoE stands for Quality of Experience, which refers to the overall '
           'satisfaction and usability of a service or product as perceived by '
           'end users. In the context of large language models (LLMs), QoE '
           'encompasses factors like inference accuracy, latency, device '
           'capacity, privacy, and security. Maintaining a high QoE is '
           'essential for ensuring that users have a positive and seamless '
           'interaction with LLM services.',
 'chat_history': [],
 'context': [Document(page_content='from the quality of experience (QoE)’s perspective of end users.\nLastly, we envisioned the future hybrid architecture of LLM\nprocessing and its corresponding sustainability concerns, partic-\nularly in the environmental carbon footprint impact. Through\nthese discussions, we provided a comprehensive overview of\nthe operational and strategic considerations essential for the\nresponsible development and deployment of LLMs.\nI. UBI

{'input': 'What is QoE?',
 'chat_history': [],
 'context': [Document(page_content='from the quality of experience (QoE)’s perspective of end users.\nLastly, we envisioned the future hybrid architecture of LLM\nprocessing and its corresponding sustainability concerns, partic-\nularly in the environmental carbon footprint impact. Through\nthese discussions, we provided a comprehensive overview of\nthe operational and strategic considerations essential for the\nresponsible development and deployment of LLMs.\nI. UBIQUITOUS LLMS\nThe recent unveiling of GPT-4o by OpenAI on May 13,\n2024 marks a pivotal moment in the evolution of large\nlanguage models (LLMs) [1]. This groundbreaking model,\naptly named with “o” signifying “omni” for its comprehensive\ncapabilities, transcends the limitations of its predecessors by\nincorporating multi modality. This signifies a significant step\ntowards achieving more natural and intuitive human-computer\ninteraction.\nThe emergence of LLMs started from th

In [132]:
# Follow-up whose "it" only makes sense given the previous turn; exercises the stored chat history.
conversational_chain("how is it used in model training?")

{'answer': 'In model training, the concept of Quality of Experience (QoE) can '
           'be used as a guiding principle to optimize the performance and '
           'efficiency of large language models (LLMs). By considering QoE '
           'during training, developers can focus on improving factors that '
           'directly impact user satisfaction, such as speed, accuracy, and '
           'cost-effectiveness.\n'
           '\n'
           'For instance, when training LLMs, developers may prioritize '
           "enhancing the model's ability to generate accurate and relevant "
           'responses within a reasonable time frame (latency). This focus on '
           'QoE can lead to models that provide more useful and timely '
           'information to users, ultimately enhancing their overall '
           'experience.\n'
           '\n'
           'Moreover, by balancing performance improvements with cost '
           'considerations, developers can ensure that the trained m

{'input': 'how is it used in model training?',
 'chat_history': [HumanMessage(content='What is QoE?'),
  AIMessage(content='QoE stands for Quality of Experience, which refers to the overall satisfaction and usability of a service or product as perceived by end users. In the context of large language models (LLMs), QoE encompasses factors like inference accuracy, latency, device capacity, privacy, and security. Maintaining a high QoE is essential for ensuring that users have a positive and seamless interaction with LLM services.')],
 'context': [Document(page_content='CTSOC NEWS ON CONSUMER TECHNOLOGY\n1\nLarge Language Models (LLMs):\nDeployment, Tokenomics and Sustainability\nHaiwei Dong Senior Member, IEEE, Shuang Xie Member, IEEE\nAbstract—The rapid advancement of Large Language Models\n(LLMs) has significantly impacted human-computer interaction,\nepitomized by the release of GPT-4o, which introduced com-\nprehensive multi-modality capabilities. In this paper, we first\nexplored th

In [145]:
import pprint

def conversational_chain(query, session_id="my_session00001"):
    """Ask the conversational RAG chain a question and pretty-print the result.

    Args:
        query: The user's question.
        session_id: Key of the chat history to read and extend. Defaults to
            the session id this cell previously hard-coded, so existing
            single-argument calls behave as before.

    Returns:
        The full result dict returned by ``conversational_rag_chain.invoke``.
    """
    answer = conversational_rag_chain.invoke(
        {"input": query},
        config={"configurable": {"session_id": session_id}},
    )
    pprint.pprint(answer)
    return answer


In [146]:
# First question asked through the redefined helper.
conversational_chain("Tell me about tokenomics")

{'answer': 'Tokenomics refers to the analysis of generative tokens in Large '
           'Language Models (LLMs) inference from an economic perspective. It '
           'involves considering two key aspects: throughput (tokens per '
           'second) and price (USD per 1 million tokens). Throughput measures '
           'how many tokens an LLM can generate in a second, while price '
           'reflects the cost of generating a certain number of tokens.\n'
           '\n'
           'For example, if a company can produce 100 tokens per second at a '
           'cost of $0.5 USD per million tokens, its tokenomics would involve '
           'both the efficiency of token generation (throughput) and the '
           'cost-effectiveness of producing tokens (price).',
 'chat_history': [],
 'context': [Document(page_content='cost-effective yet usable CPUs for the inference business.\nIV. TOKENOMICS VS. QUALITY OF EXPERIENCE: THE\nCOMPROMISE OF PERFORMANCE AND COST\nTokenomics is a compound 

{'input': 'Tell me about tokenomics',
 'chat_history': [],
 'context': [Document(page_content='cost-effective yet usable CPUs for the inference business.\nIV. TOKENOMICS VS. QUALITY OF EXPERIENCE: THE\nCOMPROMISE OF PERFORMANCE AND COST\nTokenomics is a compound word for token and economics\nreferring to the analysis of generative tokens in LLM infer-\nence from the perspective of economics. Here, we usually\nconsider two aspects: throughput (tokens per second) and\nprice (USD per 1 million tokens). According to the up-to-\ndate data [13], the throughput of most unicorn companies\nCTSOC NEWS ON CONSUMER TECHNOLOGY\n4\nFig. 4.\nThe hybrid architecture of AI processing of LLMs. The central and edge clouds and devices work together to deliver high QoE LLM service by\nbalancing factors, including inference accuracy, latency, device capacity, privacy, and security.\n(such as Mistral, Perplexity, Toghether.ai, Anyscale, Deepinfra,\nFireworks, Groq, Leption) lies about 50 to 200 tokens per\ns

In [147]:


def conversational_chain(query, session_id="my_session00001"):
    """Ask the conversational RAG chain a question and return only the answer.

    This redefinition replaces the earlier pretty-printing variant.

    Args:
        query: The user's question.
        session_id: Key of the chat history to read and extend. Defaults to
            the session id this cell previously hard-coded, so existing
            single-argument calls behave as before.

    Returns:
        The value stored under the "answer" key of the chain's result.
    """
    result = conversational_rag_chain.invoke(
        {"input": query},
        config={"configurable": {"session_id": session_id}},
    )
    return result["answer"]

In [148]:
# Follow-up in the same session; "it" refers back to the previous question.
conversational_chain("How is it related to llm carbon emission")

'Tokenomics in Large Language Models (LLMs) is related to carbon emissions through the operational footprint of LLMs. The operational footprint includes emissions from the energy consumption of hardware during pre-training, fine-tuning, and inference, which are all part of the token generation process in LLMs.\n\nWhen optimizing tokenomics in LLMs, developers and researchers aim to strike a balance between performance and cost efficiency. By understanding the relationship between token generation (throughput and price) and energy consumption, they can make more environmentally conscious decisions to reduce the carbon footprint associated with operating LLMs.\n\nFor instance, by optimizing token generation efficiency, developers can potentially decrease the energy consumption required for LLM operations, thereby reducing the overall carbon emissions associated with running these models.'

In [149]:
import gradio as gr

def conversational_rag(query, context=None):
    """Chat callback for the Gradio UI: answer ``query`` via the RAG chain.

    Args:
        query: The user's message.
        context: Accepted but unused by this function.
            NOTE(review): presumably receives the UI-side chat history from
            ``gr.ChatInterface`` — confirm.

    Returns:
        The value stored under the "answer" key of the chain's result.
    """
    # The session id is a constant, so every call shares one stored history.
    run_config = {"configurable": {"session_id": "abc123"}}
    result = conversational_rag_chain.invoke({"input": query}, config=run_config)
    return result["answer"]

# Chat UI around `conversational_rag`: titled window, a 300px-high chat pane,
# a text box with placeholder, two example questions, and explicit
# Clear / Undo / Retry / Submit buttons.
rag_demo = gr.ChatInterface(
    conversational_rag,
    title="RAG demo",
    chatbot=gr.Chatbot(height= 300),
    textbox= gr.Textbox(placeholder="Enter query here", scale=5),
    examples=["How does the introduction of GPT-4o by OpenAI, represent a pivotal advancement in the evolution of large language models, and what potential applications could arise from these expanded functionalities?", "What are the primary advantages and challenges of deploying LLMs using RAG compared to fine-tuning, particularly in terms of handling specialized knowledge and reducing hallucinations?"],
    clear_btn=gr.Button("Clear"),
    undo_btn=gr.Button("Undo"),
    retry_btn=gr.Button("Retry"),
    submit_btn=gr.Button("Submit")
)

In [150]:
# Start the Gradio app; by default it is served on a local URL only.
rag_demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


