# Prompt Engineering and Evaluation Of LLM responses

- This Notebook is created to evaluate the 100 Prompts created for Prompt Engineering and Evaluation of LLM responses.

#### Table of Contents
1. [Importing Libraries](#section1)
2. [Evaluation of Prompt](#section2)
    4.1. Response Length         
    4.2. Word Count        
    4.3. Sentiment Polarity            
    4.4. Sentiment Subjectivity          
    4.5. Relevance Score         
    4.6. Lexical Diversity                             
    4.7. Grammer Errors                    
    4.8. Grammer Score                    
    4.9. Clarity score                
    4.10. Coherence Score

### 1. Importing Libraries

In [1]:
import pandas as pd
df = pd.read_csv("prompt_engineering.csv")

### 2. Evaluation of Prompt




In [2]:
# Prompt lenght and word count
df['Prompt_length'] = df['Prompt'].str.len()
df['Prompt_Word_count'] = df['Prompt'].apply(lambda x: len(x.split())) 
df.head()

Unnamed: 0,Category,Prompt,Prompt_length,Prompt_Word_count
0,Factual,What is the tallest mountain in Africa?,39,7
1,Factual,List three differences between viruses and bac...,52,7
2,Factual,Describe the causes and effects of the French ...,57,9
3,Factual,What are the main responsibilities of the Unit...,57,9
4,Factual,Explain the economic consequences of inflation...,71,9


In [3]:
# Tone and subjectivity
from textblob import TextBlob

df['Prompt_Polarity'] = df['Prompt'].apply(lambda x: TextBlob(x).sentiment.polarity)
df['Prompt_subjectivity'] = df['Prompt'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
df.head()

Unnamed: 0,Category,Prompt,Prompt_length,Prompt_Word_count,Prompt_Polarity,Prompt_subjectivity
0,Factual,What is the tallest mountain in Africa?,39,7,0.0,0.0
1,Factual,List three differences between viruses and bac...,52,7,0.0,0.0
2,Factual,Describe the causes and effects of the French ...,57,9,0.0,0.0
3,Factual,What are the main responsibilities of the Unit...,57,9,0.166667,0.333333
4,Factual,Explain the economic consequences of inflation...,71,9,0.2,0.2


In [4]:
# Lexical Diversity
df["Prompt_Lexical_Diversity"] = df["Prompt"].apply(lambda x: len(set(x.split())) / len(x.split()) if len(x.split()) > 0 else 0)
df.head()

Unnamed: 0,Category,Prompt,Prompt_length,Prompt_Word_count,Prompt_Polarity,Prompt_subjectivity,Prompt_Lexical_Diversity
0,Factual,What is the tallest mountain in Africa?,39,7,0.0,0.0,1.0
1,Factual,List three differences between viruses and bac...,52,7,0.0,0.0,1.0
2,Factual,Describe the causes and effects of the French ...,57,9,0.0,0.0,0.888889
3,Factual,What are the main responsibilities of the Unit...,57,9,0.166667,0.333333,0.888889
4,Factual,Explain the economic consequences of inflation...,71,9,0.2,0.2,1.0


In [5]:
# Grammer errors in prompt
import language_tool_python

tool = language_tool_python.LanguageTool('en-GB')

def Grammar_error(text):
    matches = tool.check(text)
    return len(matches)

df['Prompt_Grammar_errors'] = df['Prompt'].apply(Grammar_error)
df.head()

Unnamed: 0,Category,Prompt,Prompt_length,Prompt_Word_count,Prompt_Polarity,Prompt_subjectivity,Prompt_Lexical_Diversity,Prompt_Grammar_errors
0,Factual,What is the tallest mountain in Africa?,39,7,0.0,0.0,1.0,0
1,Factual,List three differences between viruses and bac...,52,7,0.0,0.0,1.0,0
2,Factual,Describe the causes and effects of the French ...,57,9,0.0,0.0,0.888889,0
3,Factual,What are the main responsibilities of the Unit...,57,9,0.166667,0.333333,0.888889,0
4,Factual,Explain the economic consequences of inflation...,71,9,0.2,0.2,1.0,0


In [7]:
def grammar_score(text):
    #tool = language_tool_python.LanguageTool('en-GB')
    matches = tool.check(text)
    error_count = len(matches)
    word_count = len(text.split())
    score = (1 - error_count / word_count) * 100 if word_count > 0 else 0.0
    return  round(score, 3)

df['Prompt_Grammar_score'] = df['Prompt'].apply(grammar_score)
df.head()

Unnamed: 0,Category,Prompt,Prompt_length,Prompt_Word_count,Prompt_Polarity,Prompt_subjectivity,Prompt_Lexical_Diversity,Prompt_Grammar_errors,Prompt_Grammar_score
0,Factual,What is the tallest mountain in Africa?,39,7,0.0,0.0,1.0,0,100.0
1,Factual,List three differences between viruses and bac...,52,7,0.0,0.0,1.0,0,100.0
2,Factual,Describe the causes and effects of the French ...,57,9,0.0,0.0,0.888889,0,100.0
3,Factual,What are the main responsibilities of the Unit...,57,9,0.166667,0.333333,0.888889,0,100.0
4,Factual,Explain the economic consequences of inflation...,71,9,0.2,0.2,1.0,0,100.0


In [8]:
# Clarity 
def get_clarity(text):
    matches = tool.check(text)
    error_ratio = len(matches) / max(len(text.split()), 1)
    return error_ratio

df["Prompt_Clarity_score"] = df['Prompt'].apply(get_clarity)
df.head()

Unnamed: 0,Category,Prompt,Prompt_length,Prompt_Word_count,Prompt_Polarity,Prompt_subjectivity,Prompt_Lexical_Diversity,Prompt_Grammar_errors,Prompt_Grammar_score,Prompt_Clarity_score
0,Factual,What is the tallest mountain in Africa?,39,7,0.0,0.0,1.0,0,100.0,0.0
1,Factual,List three differences between viruses and bac...,52,7,0.0,0.0,1.0,0,100.0,0.0
2,Factual,Describe the causes and effects of the French ...,57,9,0.0,0.0,0.888889,0,100.0,0.0
3,Factual,What are the main responsibilities of the Unit...,57,9,0.166667,0.333333,0.888889,0,100.0,0.0
4,Factual,Explain the economic consequences of inflation...,71,9,0.2,0.2,1.0,0,100.0,0.0


In [9]:
from sentence_transformers import SentenceTransformer, util
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

model = SentenceTransformer('all-MiniLM-L6-v2')

def get_coherence(text):
    """
    Calculate coherence for both multi-sentence and single-sentence texts.
    For single sentences: checks coherence between halves of the sentence.
    """
    sentences = sent_tokenize(text)
    
    # Multi-sentence case
    if len(sentences) >= 2:
        embeddings = model.encode(sentences, convert_to_tensor=True)
        sims = [util.pytorch_cos_sim(embeddings[i], embeddings[i+1]).item() 
               for i in range(len(sentences)-1)]
        return sum(sims) / len(sims)
    
    # Single-sentence case: split sentence into two parts
    else:
        words = text.split()
        if len(words) < 2:
            return 1.0  # trivial case: single word is perfectly coherent with itself
        
        half = len(words) // 2
        parts = [
            ' '.join(words[:half]),
            ' '.join(words[half:])
        ]
        embeddings = model.encode(parts, convert_to_tensor=True)
        return util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()

df["Prompt_Coherence_score"] = df['Prompt'].apply(get_coherence)
df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Krishnapriya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  return forward_call(*args, **kwargs)


Unnamed: 0,Category,Prompt,Prompt_length,Prompt_Word_count,Prompt_Polarity,Prompt_subjectivity,Prompt_Lexical_Diversity,Prompt_Grammar_errors,Prompt_Grammar_score,Prompt_Clarity_score,Prompt_Coherence_score
0,Factual,What is the tallest mountain in Africa?,39,7,0.0,0.0,1.0,0,100.0,0.0,0.15937
1,Factual,List three differences between viruses and bac...,52,7,0.0,0.0,1.0,0,100.0,0.0,0.187638
2,Factual,Describe the causes and effects of the French ...,57,9,0.0,0.0,0.888889,0,100.0,0.0,0.243827
3,Factual,What are the main responsibilities of the Unit...,57,9,0.166667,0.333333,0.888889,0,100.0,0.0,0.20622
4,Factual,Explain the economic consequences of inflation...,71,9,0.2,0.2,1.0,0,100.0,0.0,0.485444


In [10]:
df = df.replace('N/A', 0)
df.head()

Unnamed: 0,Category,Prompt,Prompt_length,Prompt_Word_count,Prompt_Polarity,Prompt_subjectivity,Prompt_Lexical_Diversity,Prompt_Grammar_errors,Prompt_Grammar_score,Prompt_Clarity_score,Prompt_Coherence_score
0,Factual,What is the tallest mountain in Africa?,39,7,0.0,0.0,1.0,0,100.0,0.0,0.15937
1,Factual,List three differences between viruses and bac...,52,7,0.0,0.0,1.0,0,100.0,0.0,0.187638
2,Factual,Describe the causes and effects of the French ...,57,9,0.0,0.0,0.888889,0,100.0,0.0,0.243827
3,Factual,What are the main responsibilities of the Unit...,57,9,0.166667,0.333333,0.888889,0,100.0,0.0,0.20622
4,Factual,Explain the economic consequences of inflation...,71,9,0.2,0.2,1.0,0,100.0,0.0,0.485444


In [12]:
# rounding of the evalution metrics to 3 places
df['Prompt_Polarity'] = df['Prompt_Polarity'].apply(lambda x: round(x, 3))
df['Sentiment_subjectivity'] = df['Prompt_subjectivity'].apply(lambda x: round(x,3))
df['Prompt_Lexical_Diversity'] = df['Prompt_Lexical_Diversity'].apply(lambda x: round(x,3))
df.head()

Unnamed: 0,Category,Prompt,Prompt_length,Prompt_Word_count,Prompt_Polarity,Prompt_subjectivity,Prompt_Lexical_Diversity,Prompt_Grammar_errors,Prompt_Grammar_score,Prompt_Clarity_score,Prompt_Coherence_score,Sentiment_subjectivity
0,Factual,What is the tallest mountain in Africa?,39,7,0.0,0.0,1.0,0,100.0,0.0,0.15937,0.0
1,Factual,List three differences between viruses and bac...,52,7,0.0,0.0,1.0,0,100.0,0.0,0.187638,0.0
2,Factual,Describe the causes and effects of the French ...,57,9,0.0,0.0,0.889,0,100.0,0.0,0.243827,0.0
3,Factual,What are the main responsibilities of the Unit...,57,9,0.167,0.333333,0.889,0,100.0,0.0,0.20622,0.333
4,Factual,Explain the economic consequences of inflation...,71,9,0.2,0.2,1.0,0,100.0,0.0,0.485444,0.2


In [13]:
df.to_csv("Prompt_Evalmetrics.csv", index=False)