In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `state`, `agents` and `eval` modules used below)
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain.retrievers.document_compressors.cohere_rerank import CohereRerank
from functools import partial

from state import State
from langgraph.graph import StateGraph, END

from langchain_openai import ChatOpenAI
import os
from getpass import getpass

In [3]:
# Prompt for each API key only when it is not already set in the environment.
# This keeps the cell idempotent: re-running it (or running the notebook with
# the keys pre-exported) no longer blocks on an interactive prompt.
# getpass is used so the typed key is never echoed into the cell output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI API Key:")
if not os.environ.get("COHERE_API_KEY"):
    os.environ["COHERE_API_KEY"] = getpass("COHERE API Key:")

In [4]:
# Question used for both the baseline LLM call and the retrieval graph run.
query = (
    "What are the benefits of using "
    "a mixture of experts model?"
)

In [5]:
# Import PromptTemplate from langchain_core (already a dependency of this
# notebook) instead of the deprecated root-level `langchain` re-export.
from langchain_core.prompts import PromptTemplate

# Baseline prompt: the model only receives the question, no retrieved context.
# The text is kept verbatim because it is what gets sent to the model.
template = """
As an AI Engineer Expert at Mistral Company, specializing in developing cutting-edge AI technology for developers, 
your task is to analyze and offer comprehensive insights on user inquiries.
Ensure that your final responses are user-friendly and formatted as posts.
If the question cannot be answered using the information provided answer
with "I don't know".

Question: {query}

Answer: """

# `query` is the single placeholder filled in at invoke time.
prompt_template = PromptTemplate(
    input_variables=["query"],
    template=template,
)

In [6]:
# Chat model used for the baseline chain and passed to the generation agent.
# The key is read from the environment variable populated earlier.
gpt35turbo_llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.7,
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

In [7]:
# Baseline without retrieval: pipe the prompt straight into the chat model.
chain = prompt_template | gpt35turbo_llm

# Last expression of the cell, so the returned message is displayed.
chain.invoke(dict(query=query))

AIMessage(content="Using a mixture of experts model can offer several benefits, such as improved accuracy and performance in predicting outcomes. This model combines the strengths of multiple experts, allowing for more robust and well-rounded predictions. Additionally, it can help handle complex data patterns and provide more reliable results. Overall, utilizing a mixture of experts model can enhance the overall AI system's performance and provide more accurate predictions.", response_metadata={'finish_reason': 'stop', 'logprobs': None})

In [8]:

# Seed corpus as (page text, source URL) pairs; each pair becomes one Document
# whose metadata carries the URL under the "url" key.
_NEWS_ENTRIES = (
    (
        "title=Au Large, summary= Mistral Large is our flagship model, with top-tier reasoning capacities. It is also available on Azure.",
        "https://mistral.ai/news/mistral-large/",
    ),
    (
        "title=Le Chat, summary=Our assistant is now in beta access, demonstrating what can be built with our technology.",
        "https://mistral.ai/news/le-chat-mistral/",
    ),
    (
        "title=Mixtral of experts, summary=A high quality Sparse Mixture-of-Experts. ",
        "https://mistral.ai/news/mixtral-of-experts/",
    ),
    (
        "title=Mistral 7B, summary=The best 7B model to date, Apache 2.0",
        "https://mistral.ai/news/announcing-mistral-7b/",
    ),
    (
        "title=Bringing open AI models to the frontier, summary=Why we're building Mistral AI.",
        "https://mistral.ai/news/about-mistral-ai/",
    ),
)

documents = [
    Document(page_content=text, metadata={"url": url})
    for text, url in _NEWS_ENTRIES
]

In [9]:

# Index over the seed `documents`, embedded with multilingual E5.
e5_embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")
index_retriever = FAISS.from_documents(
    documents,
    e5_embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

# Not an index yet: a factory with the BGE embedding and distance strategy
# pre-bound, so a caller can build a FAISS store by passing documents later.
bge_embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")
web_retriever = partial(
    FAISS.from_documents,
    embedding=bge_embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

# Alternative reranker that was tried and left disabled:
#   from ragatouille import RAGPretrainedModel
#   colbert_reranker_model = RAGPretrainedModel.from_pretrained(
#       pretrained_model_name_or_path="colbert-ir/colbertv2.0", n_gpu=1)

# Constructed with defaults; presumably picks up the COHERE_API_KEY set above.
cohere_reranker_model = CohereRerank()

In [14]:
from agents import index_retriever_agent, rag_website_agent, rerank_agent, expert_mistral_agent

In [15]:
# Bind each agent's dependencies up front so the graph can register them as
# nodes. The meaning of `k` is defined in the `agents` module (presumably the
# number of documents kept at that stage -- confirm there).
index_retriever_agent_partial = partial(
    index_retriever_agent,
    retriever=index_retriever,
    k=1,
)
rag_website_agent_partial = partial(
    rag_website_agent,
    retriever=web_retriever,
    k=10,
)
rerank_agent_partial = partial(
    rerank_agent,
    reranker=cohere_reranker_model,
    k=3,
)
expert_mistral_agent_partial = partial(
    expert_mistral_agent,
    llm=gpt35turbo_llm,
)

In [16]:
# Linear pipeline, in execution order: (node name, callable).
_PIPELINE = [
    ("index_retriever_agent", index_retriever_agent_partial),
    ("rag_website_agent", rag_website_agent_partial),
    ("rerank_agent", rerank_agent_partial),
    ("generate", expert_mistral_agent_partial),
]
_STAGE_NAMES = [stage_name for stage_name, _stage_fn in _PIPELINE]

flow = StateGraph(State)

# Register every node first, then chain each stage to the next one; the last
# stage is connected to END.
for stage_name, stage_fn in _PIPELINE:
    flow.add_node(stage_name, stage_fn)
for upstream, downstream in zip(_STAGE_NAMES, _STAGE_NAMES[1:] + [END]):
    flow.add_edge(upstream, downstream)

flow.set_entry_point(_STAGE_NAMES[0])
app = flow.compile()

In [17]:
from pprint import pprint

# Run the compiled graph on the question and show what each step emits.
# Every streamed chunk is a dict mapping a node name to that node's output.
# `output` deliberately survives the loop: it holds the last chunk streamed.
initial_state = State(query=query)
for output in app.stream(initial_state):
    for node, state_output in output.items():
        pprint(node)
        pprint(state_output)

'index_retriever_agent'
{'documents': [Document(page_content='title=Mixtral of experts, summary=A high quality Sparse Mixture-of-Experts. ', metadata={'url': 'https://mistral.ai/news/mixtral-of-experts/'})],
 'generation': None,
 'query': 'What are the benefits of using a mixture of experts model?'}
'rag_website_agent'
{'documents': [Document(page_content='Mixtral of experts | Mistral AI | Frontier AI in your hands', metadata={'source': 'https://mistral.ai/news/mixtral-of-experts/', 'title': 'Mixtral of experts | Mistral AI | Frontier AI in your hands', 'description': 'A high quality Sparse Mixture-of-Experts.', 'language': 'en-us'}),
               Document(page_content='a high-quality sparse mixture of experts model (SMoE) with open weights. Licensed under Apache 2.0. Mixtral outperforms Llama 2 70B on most benchmarks with 6x faster inference. It is the strongest open-weight model with a permissive license and the best model overall regarding cost/performance trade-offs. In particula

In [18]:
# After the stream loop, `output` is the last chunk, which wraps the final
# state under the "__end__" key. Unwrap it only while that key is present so
# the cell is idempotent: a second run used to raise KeyError because `output`
# had already been replaced by the unwrapped state.
if "__end__" in output:
    output = output["__end__"]
pprint(output)

{'documents': [Document(page_content='following capabilities.It gracefully handles a context of 32k tokens.It handles English, French, Italian, German and Spanish.It shows strong performance in code generation.It can be finetuned into an instruction-following model that achieves a score of 8.3 on MT-Bench.Pushing the frontier of open models with sparse architecturesMixtral is a sparse mixture-of-experts network. It is a decoder-only model where the feedforward block picks from a set of 8 distinct groups of parameters. At every', metadata={'source': 'https://mistral.ai/news/mixtral-of-experts/', 'title': 'Mixtral of experts | Mistral AI | Frontier AI in your hands', 'description': 'A high quality Sparse Mixture-of-Experts.', 'language': 'en-us'}),
               Document(page_content='a high-quality sparse mixture of experts model (SMoE) with open weights. Licensed under Apache 2.0. Mixtral outperforms Llama 2 70B on most benchmarks with 6x faster inference. It is the strongest open-wei

In [19]:
# Prompt for the Google key only when it is not already set, so re-running the
# cell (or running with the key pre-exported) does not block on a prompt.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("GOOGLE API Key:")

In [20]:
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from eval import GoogleGenerativeAIModel

# `output` is expected to be the final graph state here, i.e. a mapping with
# the "query", "generation" and "documents" keys read below.

# Answer-relevancy metric with a 0.7 threshold, judged by the project's
# GoogleGenerativeAIModel wrapper.
answer_relevancy_metric = AnswerRelevancyMetric(
    threshold=0.7,
    model=GoogleGenerativeAIModel(),
)

# One test case built from the graph run: the question, the generated answer,
# and the text of every document left in the final state.
test_case = LLMTestCase(
    input=output["query"],
    actual_output=output["generation"],
    retrieval_context=[document.page_content for document in output["documents"]],
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [21]:
# Score the test case, then print the score followed by the metric's
# explanation of it, one per line.
answer_relevancy_metric.measure(test_case)
print(
    answer_relevancy_metric.score,
    answer_relevancy_metric.reason,
    sep="\n",
)


Output()

1.0
The score is 1.00 because there are no irrelevant statements in the actual output.
