In [1]:
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_ollama import OllamaLLM

In [2]:
#####################################
# Prompt Templates
#####################################

# Baseline prompt: one unstructured instruction, compared later in the
# notebook against the CO-STAR version. The leading and trailing newlines
# are part of the text that is sent to the model.
base_prompt_text = (
    "\n"
    "Make a list of 10 titles in a JSON format to generate the next best article about AI.\n"
)

# CO-STAR prompt: the same request as the baseline, split into labelled
# sections. Each section is rendered as "# HEADING #" followed by its body;
# sections are separated by a blank line and the whole text is wrapped in a
# leading and a trailing newline.
costar_prompt_text = "\n" + "\n\n".join(
    f"# {heading} #\n{body}"
    for heading, body in (
        ("CONTEXT", "I want to generate the next best article about AI."),
        ("OBJECTIVE", "Create a list of 10 titles."),
        ("STYLE", "Follow the writing style of the most successful post titles on the internet."),
        ("TONE", "Disruptive."),
        ("AUDIENCE", "My audience is mostly curious about technology, programming and AI."),
        ("RESPONSE", "The list must be in a JSON format."),
    )
) + "\n"

# Pass-through template: the complete prompt text is supplied through the
# single `user_prompt` variable, so the template adds nothing around it.
prompt_template = PromptTemplate(
    template="{user_prompt}",
    input_variables=["user_prompt"],
)

# Evaluator prompt: receives the prompt under test (`evaluated_prompt`) and
# the text it produced (`prompt_result`) and asks the model for one overall
# score followed by a one-sentence justification.
# The score range is 0-20: four criteria worth 0-5 points each, matching the
# `20` / `3` examples at the end of the template.
evaluation_prompt = PromptTemplate(
    input_variables=["evaluated_prompt", "prompt_result"],
    template="""
# CONTEXT #
You are an expert in prompt engineering and evaluating the effectiveness of prompts result for Large Language Models (LLMs). You will be provided with a prompt and its result and your task is to assess the result overall quality of the output.

# INPUT #
The prompt to be evaluated is:

```
{evaluated_prompt}
```

The result of the prompt to be evaluated is:
```
{prompt_result}
```


# EVALUATION CRITERIA #
Evaluate the prompt result based on the following criteria, considering how well each contributes to generating effective outputs:

*   **Clarity and Specificity (0-5 points):** How clear is the output? Are there any ambiguities or vague terms?
*   **Completeness (0-5 points):** Does the result provide complete answer? Are there any missing crucial details?
*   **Output and Guidelines (0-5 points):** Does the result effectively follow the guidelines?
*   **Target Audience Alignment and Tone (0-5 points):** Does the result match the target audience and tailor the output with the described tone?

# OUTPUT #
Provide a single numerical score between 0 and 20 (the sum of the four criteria above), representing the overall quality of the prompt result. A higher score indicates a better result.
After the score, briefly (in one short sentence) justify your score by highlighting one key strength or weakness.

# EXAMPLE #
If a result was perfectly clear, complete, well-constrained, and perfectly aligned with the target audience and tone, the output would be:
`20`

If a result was extremely vague and lacked crucial information, the output might be:
`3`
"""
)

In [3]:
# Ollama model tag used for both generating and evaluating results.
OLLAMA_MODEL = "mistral:7b-instruct-v0.3-q8_0"

# Print a progress message, then create the LLM wrapper shared by every chain.
print("Getting LLM...")
llm = OllamaLLM(model=OLLAMA_MODEL)

Getting LLM...


In [4]:
#####################################
# Chains
#####################################

# Build both chains in one pass: they share the same LLM and verbose logging
# and differ only in their prompt template.
#   prompt_chain     -> runs the prompt under test
#   evaluation_chain -> scores the text produced by prompt_chain
# NOTE(review): recent LangChain releases flag LLMChain as deprecated in
# favour of `prompt | llm`; migrating would also require updating every cell
# that calls these chains.
prompt_chain, evaluation_chain = (
    LLMChain(llm=llm, prompt=template, verbose=True)
    for template in (prompt_template, evaluation_prompt)
)

  prompt_chain = LLMChain(


In [5]:
def store_in_csv(iteration, prompt_result, evaluation_result, filename="prompt_results.csv"):
    """Append one experiment row to a semicolon-delimited results file.

    Newlines inside both texts are flattened to spaces so every run occupies
    a single physical line. Fields are written through ``csv.writer`` so that
    a ``;`` or ``"`` inside the LLM output is quoted/escaped instead of
    silently corrupting the row, and the file is always UTF-8 so that
    non-ASCII model output round-trips regardless of the platform locale.

    Args:
        iteration: Run index, written as the first column.
        prompt_result: Text produced by the prompt under test.
        evaluation_result: Text produced by the evaluator for that result.
        filename: Path of the file to append to; created when missing.
    """
    import csv  # local import: keeps this cell runnable on its own

    row = [
        iteration,
        prompt_result.replace("\n", " "),
        evaluation_result.replace("\n", " "),
    ]
    # newline="" hands line-ending control to the csv module; "\n" keeps the
    # same row terminator the hand-written format used.
    with open(filename, "a", newline="", encoding="utf-8") as f:
        csv.writer(f, delimiter=";", lineterminator="\n").writerow(row)

In [6]:
# Run the baseline prompt 10 times; each run is scored by the evaluator and
# appended as one row to base_prompt_results.csv.
for i in range(10):
    print(f"Base Prompt Iteration {i}")
    # `Chain.run` is deprecated: `invoke` takes the inputs as a dict and
    # returns a dict in which LLMChain stores the completion under "text".
    prompt_result = prompt_chain.invoke({"user_prompt": base_prompt_text})["text"]
    evaluation_result = evaluation_chain.invoke(
        {"evaluated_prompt": base_prompt_text, "prompt_result": prompt_result}
    )["text"]
    store_in_csv(i, prompt_result, evaluation_result, filename="base_prompt_results.csv")

  prompt_result = prompt_chain.run(user_prompt=prompt)


Base Prompt Iteration 0


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Make a list of 10 titles in a JSON format to generate the next best article about AI.
[0m

[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
# CONTEXT #
You are an expert in prompt engineering and evaluating the effectiveness of prompts result for Large Language Models (LLMs). You will be provided with a prompt and its result and your task is to assess the result overall quality of the output.

# INPUT #
The prompt to be evaluated is:

```

Make a list of 10 titles in a JSON format to generate the next best article about AI.

```

The result of the prompt to be evaluated is:
```
 {
     "articles": [
       {
         "title": "Exploring the Ethical Implications of Advanced AI: A Comprehensive Analysis",
         "description": "Delve into the complexities and consequences of advanced artificial intelligence, focusing on et

In [7]:
# Run the CO-STAR prompt 10 times; each run is scored by the evaluator and
# appended as one row to costar_prompt_results.csv.
for i in range(10):
    print(f"Costar Prompt Iteration {i}")
    # `Chain.run` is deprecated: `invoke` takes the inputs as a dict and
    # returns a dict in which LLMChain stores the completion under "text".
    prompt_result = prompt_chain.invoke({"user_prompt": costar_prompt_text})["text"]
    evaluation_result = evaluation_chain.invoke(
        {"evaluated_prompt": costar_prompt_text, "prompt_result": prompt_result}
    )["text"]
    store_in_csv(i, prompt_result, evaluation_result, filename="costar_prompt_results.csv")

Costar Prompt Iteration 0


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
# CONTEXT #
I want to generate the next best article about AI.

# OBJECTIVE #
Create a list of 10 titles.

# STYLE #
Follow the writing style of the most successful post titles on the internet.

# TONE #
Disruptive.

# AUDIENCE #
My audience is mostly curious about technology, programming and AI.

# RESPONSE #
The list must be in a JSON format.
[0m

[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
# CONTEXT #
You are an expert in prompt engineering and evaluating the effectiveness of prompts result for Large Language Models (LLMs). You will be provided with a prompt and its result and your task is to assess the result overall quality of the output.

# INPUT #
The prompt to be evaluated is:

```

# CONTEXT #
I want to generate the next best article about AI.

# OBJECTIVE #
Create a list of 10 titles.

# STYLE #
Follow the

In [8]:
import pandas as pd

In [None]:
# Both result files share one layout: no header row, ';' as the separator,
# and the columns iteration / prompt_result / evaluation_result, with the
# iteration number used as the index.
_results_csv_options = dict(
    header=None,
    names=["iteration", "prompt_result", "evaluation_result"],
    index_col="iteration",
    delimiter=";",
)
base_df = pd.read_csv("base_prompt_results.csv", **_results_csv_options)
costar_df = pd.read_csv("costar_prompt_results.csv", **_results_csv_options)

In [13]:
# Preview the first rows of the baseline results to inspect the raw
# evaluation text before a numeric score is parsed out of it.
base_df.head()

Unnamed: 0_level_0,prompt_result,evaluation_result
iteration,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"{ ""articles"": [ { ""title...","18 The result is well-structured, clear, spec..."
1,"{ ""articles"": [ { ""title...",`19` The result is overall very high quality...
2,"{ ""articles"": [ { ""ti...",18 The prompt result effectively follows the ...
3,"{ ""articles"": [ { ""...",18 (The result effectively follows the guideli...
4,"{ ""articles"": [ { ""title...",18 (The result effectively follows the guideli...


In [14]:
def _leading_score(evaluations: pd.Series) -> pd.Series:
    """Return the integer score that opens each evaluation text.

    The evaluator is asked to answer with the score first, but it decorates
    it inconsistently (`19`, "18 (The result ...", "18/20", "**18**").
    The pattern therefore skips any leading whitespace/punctuation and
    captures the first run of digits at the start of the text.

    Args:
        evaluations: Raw evaluator answers, one string per run.

    Returns:
        Integer scores aligned with the index of ``evaluations``.

    Raises:
        ValueError: If an answer does not start with a number, listing the
            offending texts instead of failing on an opaque int() conversion.
    """
    scores = evaluations.str.extract(r"^\W*(\d+)", expand=False)
    unparsed = scores.isna()
    if unparsed.any():
        raise ValueError(
            f"No leading score found in evaluation(s): {evaluations[unparsed].tolist()}"
        )
    return scores.astype(int)


base_df["evaluation_score"] = _leading_score(base_df["evaluation_result"])
costar_df["evaluation_score"] = _leading_score(costar_df["evaluation_result"])

In [15]:
# Report the mean evaluator score for each prompt variant, baseline first.
for label, frame in (("Base prompt", base_df), ("CO-STAR prompt", costar_df)):
    print(f'{label} average score:{frame["evaluation_score"].mean()}')

Base prompt average score:18.2
CO-STAR prompt average score:16.2


In [16]:
import json

In [18]:
# Sanity check: the first stored baseline result must parse as JSON.
first_base_output = base_df["prompt_result"].iloc[0]
print(json.loads(first_base_output))

{'articles': [{'title': 'Exploring the Ethical Implications of Advanced AI: A Comprehensive Analysis', 'description': 'Delve into the complexities and consequences of advanced artificial intelligence, focusing on ethical considerations and potential solutions.'}, {'title': 'AI in Healthcare: Revolutionizing Diagnosis and Treatment through Deep Learning', 'description': 'Explore how AI is transforming healthcare, with a focus on deep learning applications in diagnosis, treatment, and patient care.'}, {'title': 'The Impact of AI on Education: Personalized Learning and Smart Classrooms', 'description': 'Investigate the role of AI in education, discussing the benefits of personalized learning and smart classrooms for students.'}, {'title': 'AI and Climate Change: Predictive Modeling and Sustainable Solutions', 'description': 'Examine how AI is being used to predict climate change and develop sustainable solutions, reducing carbon emissions and promoting eco-friendly practices.'}, {'title':

In [19]:
# Sanity check: the first stored CO-STAR result must parse as JSON.
first_costar_output = costar_df["prompt_result"].iloc[0]
print(json.loads(first_costar_output))

{'titles': ['10 Mind-Blowing Ways Artificial Intelligence Is Changing the World Right Now', 'Unleashing the Power of AI: The Top 10 Technologies Revolutionizing Our Future', 'The Rise of Superintelligence: How AI Will Outsmart Us All (And What You Can Do About It)', 'AI in Your Backyard: The 10 Most Surprising Places Artificial Intelligence is Already Transforming Everyday Life', 'The AI Gold Rush: Discover the Top 10 Industries Set to Boom with Artificial Intelligence', 'Why You Should Care About AI: The Top 10 Reasons Artificial Intelligence Will Change Your Life', 'AI and Ethics: The 10 Biggest Challenges Facing Society as Artificial Intelligence Evolves', 'The Future of Work: How AI is Set to Automate or Augment Every Job in the Next 10 Years (And What You Need to Know)', 'The Dark Side of AI: The Top 10 Ethical Concerns Arising from Artificial Intelligence', 'AI and Humanity: The 10 Most Exciting Breakthroughs in AI That Could Change the Course of History']}
