In [55]:
import pandas as pd
import plotly.express as px
from rl.llm_agent import LLMAgent
from rl.environment import Environment
from rl.code_evaluator import CodeEvaluator
from rl.policies import EpsilonGreedyPolicy
from rl.utils import compute_reviewer_grade, is_terminate_grade

# Upper bound on the number of coder/reviewer rounds; the loop below may
# stop earlier when a terminating grade is reached.
MAX_EPOCHS = 100

In [56]:
# Build the shared environment and the three participants that act on it:
# a coder agent, a code evaluator, and a reviewer agent.
environment = Environment()

# Candidate prompts the coder's policy chooses between (listed worst -> best).
coder_prompt_options = [
    "Worst prompt coder",
    "Bad prompt coder",
    "Neutral prompt coder",
    "Good prompt coder",
    "Best prompt coder",
]
coder_policy = EpsilonGreedyPolicy(0.1)
agent_coder = LLMAgent(
    prompts=coder_prompt_options,
    initial_value=10,
    policy=coder_policy,
    name="Coder",
)

# The evaluator grades the code found in the environment.
agent_evaluator = CodeEvaluator(environment=environment, prompt="Evaluate the code.")

# Candidate prompts for the reviewer. NOTE: unlike the coder's list, these
# are listed best -> worst.
reviewer_prompt_options = [
    "Best prompt reviewer",
    "Good prompt reviewer",
    "Neutral prompt reviewer",
    "Bad prompt reviewer",
    "Worst prompt reviewer",
]
reviewer_policy = EpsilonGreedyPolicy(0.1)
agent_reviewer = LLMAgent(
    prompts=reviewer_prompt_options,
    initial_value=10,
    policy=reviewer_policy,
    name="Reviewer",
)

In [57]:
# Add the initial user request to the environment so the agents have
# something to respond to.
environment.add_message("User", "code something for me.")

In [58]:
# Run the coder -> evaluator -> reviewer cycle for at most MAX_EPOCHS rounds,
# recording every reward handed to each agent.
hist_coder_rewards, hist_reviewer_rewards = [], []
last_grade = None

for epoch in range(MAX_EPOCHS):
    # The coder posts a message, the evaluator grades it, and that grade is
    # fed straight back to the coder as its reward.
    agent_coder.add_message(environment)
    grade = agent_evaluator.evaluate_code()
    agent_coder.reward(grade)
    hist_coder_rewards.append(grade)

    # From the second round onward, the reviewer is rewarded with a value
    # derived from the current and the previous grade.
    if last_grade is not None:
        reviewer_grade = compute_reviewer_grade(grade, last_grade)
        agent_reviewer.reward(reviewer_grade)
        hist_reviewer_rewards.append(reviewer_grade)

    # A terminating grade ends the conversation immediately.
    if is_terminate_grade(grade):
        environment.add_message("System", "✅ Code approved. Conversation terminated.")
        break

    # Otherwise the reviewer responds, and the grade is remembered so the
    # next round can compare against it.
    agent_reviewer.add_message(environment)
    last_grade = grade

In [59]:
# Print the messages recorded in the environment during the run above
environment.print_messages()

User: code something for me.
Coder: This is a generated text with the prompt: Neutral prompt coder
Code Evaluator: The code was graded as 3
Reviewer: This is a generated text with the prompt: Best prompt reviewer
Coder: This is a generated text with the prompt: Worst prompt coder
Code Evaluator: The code was graded as 6
Reviewer: This is a generated text with the prompt: Good prompt reviewer
Coder: This is a generated text with the prompt: Bad prompt coder
Code Evaluator: The code was graded as 6
Reviewer: This is a generated text with the prompt: Neutral prompt reviewer
Coder: This is a generated text with the prompt: Good prompt coder
Code Evaluator: The code was graded as 7
Reviewer: This is a generated text with the prompt: Bad prompt reviewer
Coder: This is a generated text with the prompt: Best prompt coder
Code Evaluator: The code was graded as 7
Reviewer: This is a generated text with the prompt: Worst prompt reviewer
Coder: This is a generated text with the prompt: Good prompt

## Charts for Coder

In [60]:
# Line chart of the reward the coder received in each epoch.
# Passing explicit x/y arrays together with `labels` gives the axes meaningful
# titles; a bare list would be plotted with the generic "index"/"value" axis
# titles and a "variable" legend.
fig_1 = px.line(
    x=list(range(len(hist_coder_rewards))),  # 0-based epoch, matching the loop counter
    y=hist_coder_rewards,
    markers=True,
    title="Rewards for Coder over time",
    labels={"x": "Epoch", "y": "Reward"},
)
fig_1.show()

In [61]:
# Bar chart pairing each coder prompt with its entry in the underlying
# agent's `action_counts` (presumably how often that prompt was selected;
# confirm against LLMAgent).
coder_action_counts = agent_coder.agent.action_counts
coder_prompts = agent_coder.prompts

# One row per prompt; column order is Prompt, then Count.
prompt_counts_df = pd.DataFrame(
    {"Prompt": coder_prompts, "Count": coder_action_counts}
)

fig_2 = px.bar(prompt_counts_df, x="Prompt", y="Count", title="Prompt counts for Coder")
fig_2.show()

## Charts for Reviewer

In [62]:
# Line chart of the reward the reviewer received in each epoch.
# No reviewer reward is recorded during the first loop iteration (there is no
# previous grade to compare against yet), so entry i of the history belongs to
# 0-based epoch i + 1. The x values therefore start at 1; plotting the bare
# list would shift every point one epoch to the left and label the axes with
# the generic "index"/"value" titles.
fig_1 = px.line(
    x=list(range(1, len(hist_reviewer_rewards) + 1)),
    y=hist_reviewer_rewards,
    markers=True,
    title="Rewards for Reviewer over time",
    labels={"x": "Epoch", "y": "Reward"},
)
fig_1.show()

In [63]:
# Bar chart pairing each reviewer prompt with its entry in the underlying
# agent's `action_counts` (presumably how often that prompt was selected;
# confirm against LLMAgent).
reviewer_action_counts = agent_reviewer.agent.action_counts
reviewer_prompts = agent_reviewer.prompts

# One row per prompt; column order is Prompt, then Count.
prompt_counts_df = pd.DataFrame(
    {"Prompt": reviewer_prompts, "Count": reviewer_action_counts}
)

fig_2 = px.bar(prompt_counts_df, x="Prompt", y="Count", title="Prompt counts for Reviewer")
fig_2.show()