In [1]:
from tenacity import retry, stop_after_attempt, wait_random_exponential, retry_if_exception_type
from langchain.llms import OpenAI
from langchain import HuggingFaceHub,LLMChain
from langchain.prompts import PromptTemplate
from langchain import FewShotPromptTemplate
from langchain.output_parsers import CommaSeparatedListOutputParser
import pandas as pd
from collections import Counter
from io import StringIO
import streamlit as st
import re
import os
import openai
import ast
import json
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain.pydantic_v1 import BaseModel, Field, validator
# Retry the chain call automatically when one of the listed OpenAI errors is raised
@retry(
    # Give up once five attempts have been made
    stop=stop_after_attempt(5),
    # Random exponential backoff between attempts (multiplier=1, capped by max=60)
    wait=wait_random_exponential(multiplier=1, max=60),
    retry=retry_if_exception_type(
        (
            openai.APIError,
            openai.APIConnectionError,
            openai.RateLimitError,
            openai.Timeout,
        )
    ),
)
def run_llm_chain(hub_chain,genre, parameters,text):
    """Run the grading chain once for a single text and return its raw output.

    Args:
        hub_chain: chain object exposing ``run(genre=..., parameters=..., text=...)``.
        genre: genre the text is expected to be written in.
        parameters: input parameters the text should be based on.
        text: the text to grade.

    Returns:
        Whatever ``hub_chain.run`` returns; it is also echoed to stdout.
    """
    response = hub_chain.run(
        genre=genre,
        parameters=parameters,
        text=text,
    )
    print(response)
    return response

In [2]:
from pydantic import BaseModel, Field, conlist
from typing import List, Optional, Tuple
# Structured grading result: one numeric score plus a textual explanation.
# Used as the target schema for the output parser and to serialise the
# reference answers of the few-shot examples.
class OutputResult(BaseModel):
    # Grade assigned to the text (the grading prompt asks for a 0-100 scale)
    score: int = Field(description="The score for the text")
    # Justification for the grade
    explanation:str = Field(description="The explanation for the score")

In [3]:
from langchain.llms import OpenAI
from langchain import HuggingFaceHub,LLMChain
from langchain.prompts import PromptTemplate
from langchain import FewShotPromptTemplate
from langchain.output_parsers import CommaSeparatedListOutputParser
import pandas as pd
from collections import Counter
from io import StringIO
import streamlit as st
import re
import os
import openai
import ast
import json
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field, validator


def _example_output(score, explanation):
    """Serialise a reference grade as JSON with every brace doubled.

    The doubled braces keep the JSON literal when the few-shot prompt is
    formatted as a template (single braces would be read as placeholders).
    """
    reference = OutputResult.model_validate({"score": score, "explanation": explanation})
    return reference.model_dump_json().replace("{", "{{").replace("}", "}}")


def _few_shot_examples():
    """Return the hand-graded reference examples shown to the grader model."""
    # (genre, parameters, text, reference score, reference explanation)
    graded = [
        (
            "sci-fi story",
            "Alpha Galaxy Explorer, Celestial Iris, pursuit of ancient artifacts in a parallel universe",
            "In the vast depths of the Alpha Galaxy, nestled within the glimmering expanse of Celestial Iris, resides our remarkable protagonist, the Alpha Galaxy Explorer. With a boundless curiosity and unwavering determination, our explorer embarks on an extraordinary adventure in search of ancient artifacts in a parallel universe. Journeying through shimmering stardust and cosmic membranes, they unlock the secrets of a forgotten civilization, facing ethereal creatures and unraveling the celestial architecture. Each artifact brings them closer to uncovering the truth of their own parallel existence, teetering between surreal polarities of light and darkness, and transcending the barriers of reality itself.",
            95,
            "The text is fluent and coherent, it is perfecty aligned and based on the input text parameters and matches the genre. It is interesting and creative. It is a bit too general and doesn't provide a lot of detail on the adventure itself thats why I reduce the score from 100. Overall very good quality",
        ),
        (
            "fairytale",
            "Gandalf the Great, Magiglot Woodlands, Quest of the Whispering Moonstone",
            "Gandalf the Great, Magiglot Woodlands, Quest of the Whispering Moonstone, and the Quest of the Golden Apple. Gandalf the Great Gandalf the Great is a quest in The Elder Scrolls V: Skyrim. Background class=sprite edit-pencil />Edit Gandalf the Great is a legendary wizard who is the leader of the Fellowship of the Ring. He is the only known member of the Fellowship to have survived the War of the Ring. The quest begins with the player being summoned to the Hall of Fire by the High Elf, Alduin. Alduin is the leader of the White-Gold Elves, and is the only known member of the White-Gold Elves to have survived the War of the Ring. The player is then given the task of retrieving the Golden Apple from the Hall of Fire. The Golden Apple is a powerful magical item that can be used to summon Alduin. The player is then given the task of retrieving the Whispering Moonstone from the Hall of Fire. The Whispering Moonstone is a powerful magical item that can be used to summon Alduin.",
            10,
            "The text is inconsistent, not fluent and not very coherent.I gave it a score of 10 because it does mention the input parameters - the name of the magical character 'Gandalf the Great' and the magical artifacts names",
        ),
        (
            "fairytale",
            "Santriel the Beautiful, Magic Forest, Quest for Moonstone",
            "Santriel the Beautiful, Magic Forest, Quest for Moonstone->Moonstone, Quest for Moonstone->Moonstone, Quest for Moonstone->Moonstone, Quest for Moonstone->Moonstone, Quest for Moonstone->Moonstone, Quest for Moonstone->Moonstone, Quest for Moonstone->Moonstone, Quest for Moonstone->Moonstone, Quest for Moonstone->Moonstone, Quest for Moonstone->Moonstone, Quest for Moonstone->Moonstone",
            0,
            "The provided text is not fluent and there is no coherent development of a storyline.",
        ),
        (
            "fairytale",
            "Santriel the Beautiful, Magic Forest, Quest for Moonstone",
            "In the enchanting Magic Forest, there lived a majestic and mysterious character named Seraphina, known for her captivating beauty and unparalleled magical powers. One day, Seraphina embarked on a perilous journey to find the elusive Moonstone, said to possess the power to grant wishes. As she ventured deeper into the forest, she encountered various challenges and obstacles, including treacherous paths, mischievous creatures, and ancient spells. Despite these challenges, Seraphina remained steadfast in her determination, using her cunning and magical abilities to overcome each hurdle. With her unwavering courage and unshakeable belief in her mission, she finally reached the Moonstone, which glowed with an ethereal light. As she touched the Moonstone, Seraphina felt an overwhelming sense of power and wisdom, and she knew that her journey had been worth it. With the Moonstone in hand, she returned to the Magic Forest, where she was hailed as a hero and celebrated for her bravery and determination. From that day on, Seraphina's legend grew, inspiring countless others to embark on their own quests for the Moonstone, and the Magic Forest became a symbol of hope and magic, where anything was possible.",
            70,
            "The provided text is well written, consistent and fluent. I reduced the score from 100 because it doesn't fully follow the input parameters - the requested name of the magical character in the input parameters is 'Santriel the Beautiful' as the name of the character in the story is 'Seraphina'",
        ),
        (
            "news item",
            "GizmoDo, The Buddy Team, lyrical",
            "GizmoDo, Oh oh oh! The Buddy Team, La lala lala! It's the newest gadget that you won't want to miss, With features that'll make you say, 'What is this?!' Catchy and cool, it'll make you groove, GizmoDo, the perfect tech move! Order one today and be part of the fun, Mac or PC, it works with everyone!",
            0,
            "The provided text does not match the requested genre.",
        ),
        (
            "superhero comics",
            "superhero name, location, villain",
            "The [superhero name] embarked on a thrilling adventure in the his native [location]. With [superhero name]'s powers, he overcame every challenge that came his way and ultimately defeated the [villain]. The residents of [location] forever praised [superhero name]'s bravery and kindness, making them a legendary hero honored for generations to come.",
            0,
            "The provided text is a template without content, it is too generic and lacks specificity and interest.",
        ),
    ]
    return [
        {
            "genre": example_genre,
            "parameters": example_parameters,
            "text": example_text,
            "output": _example_output(example_score, example_explanation),
        }
        for example_genre, example_parameters, example_text, example_score, example_explanation in graded
    ]


def _build_grading_prompt(parser):
    """Assemble the few-shot grading prompt: instructions, examples, then the text to grade."""
    # template used to render each graded example
    example_template = """
        User: 
        genre: {genre},
        parameters: {parameters},
        text:{text}.
        AI: {output}
    """
    example_prompt = PromptTemplate(
        input_variables=["genre","parameters","text","output"],
        template=example_template
    )

    # the prefix holds the grading instructions
    prefix = """You are a teacher grading a story text written by a student. Given a genre for the text, set of parameters on which the text should be based, and a text written by the student based on those parameters in the specified genre, you need to score the text on a continual scale from 0 (worst) to 100 (best),
                where a score of 0 means 'The given text makes no sense and is totally not understandable or is not related to the given text parameters or is not related to the specified genre' and a score of 100 means 
            'The given text matches the specified genre, is perfect-written, fluent and coherent and highly consistent with the given text parameters'. Deduce ALL points if the text doesn't match the input parameters. Deduce ALL points if either the input parameters or the text are generic,template-like or lacks specifity and details.
            You also need to provide a short one-sentence explanation for your score.
            The result should be a json format with two attributes, score and explanation. Transform the output into structured object given those instructions: {format_instructions}. Please use the following examples as a reference for your evaluation.:
    """

    # the suffix holds the user input and the output indicator
    suffix = """
    User: 
        genre: {genre},
        parameters: {parameters},
        text:{text}.
    AI:"""

    return FewShotPromptTemplate(
        examples=_few_shot_examples(),
        example_prompt=example_prompt,
        prefix=prefix,
        suffix=suffix,
        input_variables=["genre","parameters","text"],
        example_separator="\n\n",
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )


def evaluateDataset(genre, dataframe, model_name="gpt-3.5-turbo", temperature=1.5) -> pd.DataFrame:
    """Grade every text in ``dataframe`` with an OpenAI chat model.

    Each row is sent through a few-shot grading prompt; the model's answer is
    parsed into an ``OutputResult`` (score + explanation).

    Args:
        genre: genre every text is graded against (e.g. "fairytale").
        dataframe: frame whose 'parameters' and 'text' columns are read per row.
        model_name: chat model to use; defaults to the original "gpt-3.5-turbo".
        temperature: sampling temperature; defaults to the original 1.5.
            NOTE(review): a high temperature makes grades vary between runs;
            consider passing a low value (e.g. 0) for repeatable evaluation.

    Returns:
        DataFrame with columns genre, parameters, text, score, explanation.
        Rows whose model output cannot be parsed are skipped, so the result
        may contain fewer rows than ``dataframe``.

    Raises:
        Whatever ``run_llm_chain`` raises once its retries are exhausted.
    """
    from langchain.chat_models import ChatOpenAI

    # The API key is read from the environment. The model object is named `llm`
    # so that it no longer shadows the `openai` module.
    api_key = os.getenv("OPENAI_API_KEY")
    openai.api_key = api_key
    llm = ChatOpenAI(
        model_name=model_name,
        openai_api_key=api_key,
        temperature=temperature
    )

    parser = PydanticOutputParser(pydantic_object=OutputResult)
    hub_chain = LLMChain(prompt=_build_grading_prompt(parser),llm=llm,verbose=True)

    # Collect plain records and build the frame once at the end instead of
    # concatenating a one-row frame per iteration.
    records = []
    for index, row in dataframe.iterrows():
        parameters_value = row['parameters']
        text_value = row['text']

        output = run_llm_chain(hub_chain,genre, parameters_value,text_value)
        try:
            parsed_result = parser.parse(output)
        except Exception as e:
            # Best effort: an unparsable answer drops this row only.
            print(f"Skipping row {index}: could not parse model output: {e}")
            continue

        # Compare against None rather than truthiness: 0 is a valid score and
        # must not be turned into the 'Not specified' placeholder.
        score_value = parsed_result.score if parsed_result.score is not None else 'Not specified'
        print('score:',score_value)

        explanation_value = parsed_result.explanation if parsed_result.explanation else 'Not specified'
        print('explanation:',explanation_value)

        records.append({
            'genre': genre,
            'parameters':parameters_value,
            'text':text_value,
            'score':score_value,
            'explanation':explanation_value
        })

    # Explicit columns keep the schema stable even when no row was graded.
    return pd.DataFrame(records, columns=['genre', 'parameters', 'text', 'score', 'explanation'])

## Evaluation of generated stories for given parameters by different fine-tuned models
Evaluation of stories generated by different models: the original model and versions fine-tuned on datasets of different sizes.

### Evaluation of llama2-7b generated stories:
- original model generation
- llama2-7b fine-tuned on a dataset of 2K+ entries
- llama2-7b fine-tuned on a dataset of 500 entries

#### Evaluation of the non-fine-tuned (original) model for generation of stories

In [4]:
# Stories to grade for this section (file: llama2_finetuned_original.csv)
csv_file_path = '../../datasets/fine tuning/for evaluation/llama2_finetuned_original.csv'
df = pd.read_csv(csv_file_path)
# Grade every row as a fairytale; evaluateDataset reads the 'parameters' and 'text' columns
another_df=evaluateDataset("fairytale",df)

  warn_deprecated(
  warn_deprecated(
  warn_deprecated(




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a teacher grading a story text written by a student. Given a genre for the text, set of parameters on which the text should be based, and a text written by the student based on those parameters in the specified genre, you need to score the text on a continual scale from 0 (worst) to 100 (best),
                where a score of 0 means 'The given text makes no sense and is totally not understandable or is not related to the given text parameters or is not related to the specified genre' and a score of 100 means 
            'The given text matches the specified genre, is perfect-written, fluent and coherent and highly consistent with the given text parameters'. Deduce ALL points if the text doesn't match the input parameters. Deduce ALL points if either the input parameters or the text are generic,template-like or lacks specifity and details.
            You also need to provide a short one-sentence

In [5]:
# Normalise the score column to integers: any non-numeric placeholder (such as
# the 'Not specified' marker) counts as a score of 0. pd.to_numeric is used
# instead of a regex replace over literal markers, which left the column
# dtype to pandas' implicit downcasting.
another_df['score'] = pd.to_numeric(another_df['score'], errors='coerce').fillna(0).astype(int)
another_df

  another_df['score'] = another_df['score'].replace(to_replace=['Not specified', 'not specified'], value=0, regex=True)


Unnamed: 0,genre,parameters,text,score,explanation
0,fairytale,"Aquilan the Mistweaver, Glimmering Galaxies, s...",\n\nYou have 100000000000000000000000000000000...,0,"The provided text makes no sense, it is not re..."
1,fairytale,"MoonChild, Realm of Dreams, Quest to find the ...","Moonlight, Quest to find the Lost Mirror Piece...",0,The provided text is not fluent and there is n...
2,fairytale,"Amoon Oakhaven, Enchanted Forest of Lumn, Tapp...",", and The Wandering Woods.\n\nThe Enchanted Fo...",70,The text is beautifully written and consistent...
3,fairytale,"Enigma, Neverland, saving the lost memories",\n\nThe story is set in a world where memories...,90,"The text is well-written and coherent, it disp..."
4,fairytale,"Lumina, Underwater Kingdom, Finding the Lost P...",\n\n### 2. The Lost City\n\n* Location: The Lo...,0,"The provided text is not a story text, it is a..."
5,fairytale,"Moonbeam, Nebulosa, The enchanted prism",\n\nThe enchanted prism is a rare and powerful...,20,"The provided text lacks fluency, some paragrap..."
6,fairytale,"Stellar Eclipse, Celest Earth Haven, Mind Games",", and more.\n\n### 🎮 Gameplay\n\n🔍 Explore a v...",0,The provided text does not match the requested...
7,fairytale,"Luminia, Enchanted Forest, finding the golden key","Luminia, Enchanted Forest, finding the golden ...",0,The provided text is not fluent and there is n...
8,fairytale,"Merlin, floating islands of Zumaria, search of...",\n\nThe floating islands of Zumaria are a seri...,20,The text mentioned the input parameters in a b...
9,fairytale,"Luna Star, Rainbow Valley, The Emerald Multi-P...",The Emerald Multi-Puzzle Quest is a challengi...,5,"Although the text is interesting, it does not ..."


In [6]:
# Write the evaluations to file and calculate the average score
csv_file_path = '../../datasets/fine tuning/for evaluation/llama2_finetuned_original_evaluations.csv'

# Save the full evaluation frame (all columns, without the index) to a CSV file
another_df.to_csv(csv_file_path, index=False)
# Anything that is still non-numeric becomes NaN and is ignored by mean()
another_df['score'] = pd.to_numeric(another_df['score'], errors='coerce')

# Calculate the average score across all numeric values in the 'score' column
average_score = another_df['score'].mean()
# Label uses "llama2-7b" to match the other evaluation cells
print("average score for llama2-7b non fine tuned: ",average_score)

average score for simple df:  26.176470588235293


#### Evaluation of llama2-7b fine-tuned on the full dataset (2K+ entries)

In [11]:
# Stories to grade for this section (file: llama2_finetuned_full.csv)
csv_file_path = '../../datasets/fine tuning/for evaluation/llama2_finetuned_full.csv'
df = pd.read_csv(csv_file_path)
# Grade every row as a fairytale; evaluateDataset reads the 'parameters' and 'text' columns
another_df=evaluateDataset("fairytale",df)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a teacher grading a story text written by a student. Given a genre for the text, set of parameters on which the text should be based, and a text written by the student based on those parameters in the specified genre, you need to score the text on a continual scale from 0 (worst) to 100 (best),
                where a score of 0 means 'The given text makes no sense and is totally not understandable or is not related to the given text parameters or is not related to the specified genre' and a score of 100 means 
            'The given text matches the specified genre, is perfect-written, fluent and coherent and highly consistent with the given text parameters'. Deduce ALL points if the text doesn't match the input parameters. Deduce ALL points if either the input parameters or the text are generic,template-like or lacks specifity and details.
            You also need to provide a short one-sentence

In [12]:
# Normalise the score column to integers: any non-numeric placeholder (such as
# the 'Not specified' marker) counts as a score of 0. pd.to_numeric is used
# instead of a regex replace over literal markers, which left the column
# dtype to pandas' implicit downcasting.
another_df['score'] = pd.to_numeric(another_df['score'], errors='coerce').fillna(0).astype(int)
another_df

Unnamed: 0,genre,parameters,text,score,explanation
0,fairytale,"Aquilan the Mistweaver, Glimmering Galaxies, s...","In the enchanting land of Glimmering Galaxies,...",90,"The text is well-written, fluent, and coherent..."
1,fairytale,"MoonChild, Realm of Dreams, Quest to find the ...","In the mystical Realm of Dreams, the MoonChild...",90,"The text is well-written, aligns perfectly wit..."
2,fairytale,"Amoon Oakhaven, Enchanted Forest of Lumn, Tapp...","In the Enchanted Forest of Lumn, there lived a...",100,"The text is beautifully written, consistent, a..."
3,fairytale,"Enigma, Neverland, saving the lost memories","In the mystical land of Neverland, there lived...",85,"The text is well-written and cohesive, providi..."
4,fairytale,"Lumina, Underwater Kingdom, Finding the Lost P...","In the enchanting Underwater Kingdom, Lumina, ...",80,"The text is well-written, coherent, and consis..."
5,fairytale,"Moonbeam, Nebulosa, The enchanted prism","In the mystical land of Nebulosa, there lived ...",90,"The text is well-written, rich in detail, and ..."
6,fairytale,"Stellar Eclipse, Celest Earth Haven, Mind Games","In the mystical land of Celest Earth Haven, th...",90,The text is well-written and captures the esse...
7,fairytale,"Luminia, Enchanted Forest, finding the golden key","In the mystical land of Luminia, deep within t...",90,"The text is cohesive and well-written, and it ..."
8,fairytale,"Merlin, floating islands of Zumaria, search of...","In the mystical land of Zumaria, there lived a...",85,"The text is well-written, fluent, and coherent..."
9,fairytale,"Luna Star, Rainbow Valley, The Emerald Multi-P...","In the enchanting land of Rainbow Valley, ther...",100,The text is perfectly aligned with the specifi...


In [13]:
# Write the evaluations to file and calculate the average score
csv_file_path = '../../datasets/fine tuning/for evaluation/llama2_finetuned_full_evaluations.csv'

# Save the full evaluation frame (all columns, without the index) to a CSV file
another_df.to_csv(csv_file_path, index=False)
# Anything that is still non-numeric becomes NaN and is ignored by mean()
another_df['score'] = pd.to_numeric(another_df['score'], errors='coerce')

# Calculate the average score across all numeric values in the 'score' column
average_score = another_df['score'].mean()
print("average score for llama2-7b fine tuned on 2000+ entries: ",average_score)

average score for simple df:  90.34


#### Evaluation of llama2-7b fine-tuned on the small dataset (500 entries)

In [6]:
# Stories to grade for this section (file: llama2_finetuned_small.csv)
csv_file_path = '../../datasets/fine tuning/for evaluation/llama2_finetuned_small.csv'
df = pd.read_csv(csv_file_path)
# Grade every row as a fairytale; evaluateDataset reads the 'parameters' and 'text' columns
another_df=evaluateDataset("fairytale",df)

  warn_deprecated(
  warn_deprecated(
  warn_deprecated(




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a teacher grading a story text written by a student. Given a genre for the text, set of parameters on which the text should be based, and a text written by the student based on those parameters in the specified genre, you need to score the text on a continual scale from 0 (worst) to 100 (best),
                where a score of 0 means 'The given text makes no sense and is totally not understandable or is not related to the given text parameters or is not related to the specified genre' and a score of 100 means 
            'The given text matches the specified genre, is perfect-written, fluent and coherent and highly consistent with the given text parameters'. Deduce ALL points if the text doesn't match the input parameters. Deduce ALL points if either the input parameters or the text are generic,template-like or lacks specifity and details.
            You also need to provide a short one-sentence

In [7]:
# Normalise the score column to integers: any non-numeric placeholder (such as
# the 'Not specified' marker) counts as a score of 0. pd.to_numeric is used
# instead of a regex replace over literal markers, which left the column
# dtype to pandas' implicit downcasting.
another_df['score'] = pd.to_numeric(another_df['score'], errors='coerce').fillna(0).astype(int)
another_df

Unnamed: 0,genre,parameters,text,score,explanation
0,fairytale,"Aquilan the Mistweaver, Glimmering Galaxies, s...",In the enchanting realm of Glimmering Galaxies...,95,"The text is well-written, engaging, and follow..."
1,fairytale,"MoonChild, Realm of Dreams, Quest to find the ...","In the mystical Realm of Dreams, the MoonChild...",85,"The text is well-written, consistent with the ..."
2,fairytale,"Amoon Oakhaven, Enchanted Forest of Lumn, Tapp...","In the heart of the Enchanted Forest of Lumn, ...",90,The text beautifully encompasses the specified...
3,fairytale,"Enigma, Neverland, saving the lost memories","In the mystical land of Enigma, a magical crea...",85,"The text is immersive and detailed, capturing ..."
4,fairytale,"Lumina, Underwater Kingdom, Finding the Lost P...","In the enchanting Underwater Kingdom, Lumina, ...",85,"The text is engaging, creative, and well-writt..."
5,fairytale,"Moonbeam, Nebulosa, The enchanted prism","In the mystical land of Nebulosa, there lived ...",75,"The text is well-written and coherent, maintai..."
6,fairytale,"Stellar Eclipse, Celest Earth Haven, Mind Games","In the mystical realm of Celest Earth Haven, t...",85,The text is well-developed and holds the reade...
7,fairytale,"Luminia, Enchanted Forest, finding the golden key","In the mystical land of Luminia, deep within t...",100,The text is perfectly aligned and based on the...
8,fairytale,"Merlin, floating islands of Zumaria, search of...","In the mystical land of Zumaria, where the air...",85,"The text is well-written, fluent, and coherent..."
9,fairytale,"Luna Star, Rainbow Valley, The Emerald Multi-P...","In the enchanting land of Rainbow Valley, Luna...",85,"The text is well-written and flows coherently,..."


In [9]:
# Write the evaluations to file and calculate the average score
csv_file_path = '../../datasets/fine tuning/for evaluation/llama2_finetuned_small_evaluations.csv'

# Save the full evaluation frame (all columns, without the index) to a CSV file
another_df.to_csv(csv_file_path, index=False)
# Anything that is still non-numeric becomes NaN and is ignored by mean()
another_df['score'] = pd.to_numeric(another_df['score'], errors='coerce')

# Calculate the average score across all numeric values in the 'score' column
average_score = another_df['score'].mean()
print("average score for llama2-7b fine tuned on 500 entries: ",average_score)

average score for simple df:  91.0204081632653


### Evaluation of granite-lab generated stories:
- original model generation
- granite-7b-lab fine-tuned on a dataset of 2K+ entries
- granite-7b-lab fine-tuned on a dataset of 500 entries