In [1]:
%%capture --no-stderr
%pip install --quiet -U langgraph langchain-anthropic

### LLM

Set `ANTHROPIC_API_KEY` and load LLM.

In [None]:
import getpass

def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")

_set_env("ANTHROPIC_API_KEY")

Ensure `langchain-anthropic>=0.1.23` for caching. 

In [1]:
from langchain_anthropic import ChatAnthropic
llm = ChatAnthropic(model="claude-3-5-sonnet-20240620", 
                    extra_headers={"anthropic-beta":"prompt-caching-2024-07-31"}) 

                extra_headers was transferred to model_kwargs.
                Please confirm that extra_headers is what you intended.


### State

Let's define state for for graph.

In [2]:
import operator
from typing import List, Optional
from typing import Annotated
from typing_extensions import TypedDict
from langchain_core.messages import AnyMessage
from langgraph.graph.message import add_messages

class InterviewState(TypedDict):
    topic: str
    messages: Annotated[List[AnyMessage], add_messages]
    analyst: str
    editor_feedback: str
    interviews: list
    reports: list 

### Question Generation

In [12]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.runnables import chain as as_runnable

gen_qn_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            
            """You are an analyst tasked with interviewing an expert to learn about a specific topic. 

            Your goal is boil down to interesting and specific insights related to your topic.

            1. Interesting: Insights that people will find surprising or non-obvious.
            
            2. Specific: Insights that avoid generalities and include specific examples from the expert.
    
            Here is your topic of focus and set of goals: {persona}
            
            Begin by introducing yourself using a name that fits your persona, and then ask your question.

            Continue to ask questions to drill down and refine your understanding of the topic.
            
            When you are satisfied with your understanding, complete the interview with: "Thank you so much for your help!"

            Remember to stay in character throughout your response, reflecting the persona and goals provided to you.""",
        
        ),
        MessagesPlaceholder(variable_name="messages", optional=True),
    ]
)

@as_runnable
def generate_question(state: InterviewState):
    """ Node to generate a question """

    # Get state
    analyst = state["analyst"]
    messages = state["messages"]

    # Generate question 
    gen_question_chain = gen_qn_prompt.partial(persona=analyst) | llm   
    result = gen_question_chain.invoke({"messages": messages})
    
    # Write messages to state
    return {"messages": [result]}

In [26]:
from langchain_core.messages import AIMessage, HumanMessage

# Topic
topic = "Technical innovations related to the Meta LLlama3.1 LLM"

# Starter message
messages = [HumanMessage(f"So you said you were researching {topic}?")]

# Analyst role
role = "My focus is on the training infrasturcture -- such as # GPU, networking, etc -- used for llama3.1."

# Analyst to test 
interview_state = {"analyst": role, "messages": messages}

# Generate
question = generate_question.invoke(interview_state)

# Append to messages
messages.extend(question['messages'])

In [27]:
from pprint import pprint
for m in messages:
    m.pretty_print()


So you said you were researching Technical innovations related to the Meta LLlama3.1 LLM?

Hello there! I'm Dr. Emily Chen, an AI infrastructure analyst specializing in large language model architectures. Yes, that's correct - I'm particularly interested in the training infrastructure used for Meta's Llama 3.1 model. 

To start off, could you tell me about any specific details you know regarding the GPU setup used for training Llama 3.1? I'm especially curious about the number and type of GPUs that might have been employed.


### Question Answering

We have an expert with access to the [full llama3.1 paper](https://scontent-sjc3-1.xx.fbcdn.net/v/t39.2365-6/453304228_1160109801904614_7143520450792086005_n.pdf?_nc_cat=108&ccb=1-7&_nc_sid=3c67a6&_nc_ohc=LQo4g8kQLc4Q7kNvgH9HnsN&_nc_ht=scontent-sjc3-1.xx&oh=00_AYD-ZjbsTOnF3JKDvb8di6uXkqNlTg5u-8ZVvfG8sT8flg&oe=66C2DA07).

Download and supply local path.

Now, `formatted_pages` will be our fixed context that we want to [cache](https://www.anthropic.com/news/prompt-caching)! 

In [21]:
from langchain_community.document_loaders import PyPDFLoader

# Load llama3.1 paper (pdf)
loader = PyPDFLoader("/Users/rlm/Desktop/llama_3_1_paper.pdf")
pages = loader.load_and_split()

# Full paper, except for references 
all_pages_except_references=pages[:100]

# Format
formatted_pages = "\n --- \n".join(
    [
        f'<Document source="{doc.metadata["source"]}" page="{doc.metadata.get("page", "")}"/>\n{doc.page_content}\n</Document>'
        for doc in all_pages_except_references
    ]
)

In [22]:
# For RAG with context
system_message = {"role":"system",
            "content":[{
                "type":"text",
                "text":f"To answer question in the below conversation, use this context: {formatted_pages}",
                "cache_control":{"type":"ephemeral"}
            }]} 

@as_runnable
def generate_answer(state: InterviewState):
    """ Node to answer a question """

    # Get state
    messages = state["messages"]

    # Add context
    messages.insert(0, system_message)
   
    # Answer question
    answer = llm.invoke(messages)  
    
    # Name the message as coming from the expert
    answer.name = "expert"
    
    # Append it to state
    return {"messages": [answer]}

In [23]:
# Test
interview_state = {"messages": messages}
answer = generate_answer.invoke(interview_state)
messages.extend(answer['messages'])

In [32]:
for m in messages:
    m.pretty_print()


So you said you were researching Technical innovations related to the Meta LLlama3.1 LLM?

Hello there! I'm Dr. Emily Chen, an AI infrastructure analyst specializing in large language model architectures. Yes, that's correct - I'm particularly interested in the training infrastructure used for Meta's Llama 3.1 model. 

To start off, could you tell me about any specific details you know regarding the GPU setup used for training Llama 3.1? I'm especially curious about the number and type of GPUs that might have been employed.
Name: expert

[]
