## 1. Loading data files

In [17]:
import ast
import json
import csv
import sys
import os

# Append the path of the project root to sys.path
sys.path.append(os.path.abspath('../../'))

from src.data_handling.source_cleanup import source_conversation_data, context_conversation_data
from src.metrics.compute_cosine import compute_similarity
from src.metrics.compute_rouge import compute_rouge_scores
from src.metrics.compute_bleu import compute_bleu_score

data_path = '../../data'

non_equal_data = []
with open(f'{data_path}/cleaned_non_equal.csv', 'r') as f:
    reader = csv.reader(f, delimiter=',', quotechar='"')
    for line in reader:
        non_equal_data.append(line)

equal_data = []
with open(f'{data_path}/cleaned_equal.csv', 'r') as f:
    reader = csv.reader(f, delimiter=',', quotechar='"')
    for line in reader:
        equal_data.append(line)

In [None]:
!pwd

/Users/siddhant/Projects/cs-responses/src/exp


In [None]:
"""
Data formating: 
1. Column 1 [id] → Unique identifier column
2. Column 2 [prev_context] → Context of conversation
3. Column 3 [response] → AI Generated Response
4. Column 4 [agent_response] → Customer Support Agent Response (Human Response)
5. Column 5 [sources] → Knowledge base for AI agent to generate response.
"""

"""
Methods to evaluate LLM responses
1. BLEU Score (2/6)
2. ROGUE Score (2/6)
"""

### BLEU Score : This is generally a good metric for evaluating translation systems but a good v0 approach. [[ref]](https://aclanthology.org/P02-1040.pdf)

In [32]:
from nltk.translate.bleu_score import sentence_bleu

def compute_bleu(data_list):
    """method to compute bleu scores

    Args:
        data_list (list): data from csv file

    Returns:
        - unigram_score (float)
        - bigram_score (float)
        - trigram_score (float)
        - quadgram_score (float)
        - generalised_score (float)
    """
    # Step 1: Declarations
    unigram_scores = []
    bigram_scores = []
    trigram_scores = []
    quadgram_scores = []
    generalised_scores = []

    # Step 2: Iterate over the data_list
    for data in data_list:
        # weights here represent importance to n-gram
        # weight (1, 0, 0, 0) - importance only to uni-gram
        # weight (0, 0, 1, 0) - importance only to tri-gram
        unigram_scores.append(sentence_bleu([data[2]], data[3], weights=(1, 0, 0, 0)))
        bigram_scores.append(sentence_bleu([data[2]], data[3], weights=(0, 1, 0, 0)))
        trigram_scores.append(sentence_bleu([data[2]], data[3], weights=(0, 0, 1, 0)))
        quadgram_scores.append(sentence_bleu([data[2]], data[3], weights=(0, 0, 0, 1)))
        generalised_scores.append(sentence_bleu([data[2]], data[3]))
    
    # Step 3: Compute the n-gram scores
    unigram_score = sum(unigram_scores) / len(unigram_scores)
    bigram_score = sum(bigram_scores) / len(bigram_scores)
    trigram_score = sum(trigram_scores) / len(trigram_scores)
    quadgram_score = sum(quadgram_scores) / len(quadgram_scores)
    generalised_score = sum(generalised_scores) / len(generalised_scores)

    # Step 4: Return the n-gram scores
    return (unigram_score, bigram_score, trigram_score, quadgram_score, generalised_score)


### Rouge score

In [58]:
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def compute_rouge_scores(data_list):
    """method to compute rouge scores

    Args:
        data_list (list): data from csv file

    Returns:
    """
    scores = {
        'rouge1': {'precision': [], 'recall': [], 'fmeasure': []},
        'rouge2': {'precision': [], 'recall': [], 'fmeasure': []},
        'rougeL': {'precision': [], 'recall': [], 'fmeasure': []},
    }

    for data in data_list[:10]:
        score = scorer.score(data[2], data[3])
        for k, v in score.items():
            scores[k]['precision'].append(v.precision)
            scores[k]['recall'].append(v.recall)
            scores[k]['fmeasure'].append(v.fmeasure)
    
    for k,v in scores.items():
        scores[k]['precision'] = sum(scores[k]['precision']) / len(scores[k]['precision'])
        scores[k]['recall'] = sum(scores[k]['recall']) / len(scores[k]['recall'])
        scores[k]['fmeasure'] = sum(scores[k]['fmeasure']) / len(scores[k]['fmeasure'])
        
    return scores

In [47]:
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

s = scorer.score('sample sentence', 'sample sentece 2')
s['rougeL']

Score(precision=0.3333333333333333, recall=0.5, fmeasure=0.4)

### Cosine Similarity - Computing similarity scores using text representations

In [29]:
equal_data[6]

['a28045e9-1d93-43db-adba-ae3eb516908f',
 "Customer's Message: Hello,\n\nI have just noticed the address is incorrect!\n\n12 Arthur street Tranmere SA is the correct address. Can you please update the shipping company. Looks like I can’t at my end.\nAgent's Message: Hi <PERSON>\nWe have received your query. One of our team members will get back to you shortly. We usually respond within <DATE_TIME>.\nIf your question is in regards to a purchased item, please provide your order number if you have not already done so.\nPlease visit\n<URL>\nto view our FAQ.\nThank you!\nMiansai Support\nCustomer's Message: Hello,\n\nThe address looks wrong 😑 12 Arthur street Tranmere SA\n",
 'Hi,\n\nThank you for reaching out. Unfortunately, we cannot update the order because it was already shipped on <DATE_TIME>. The tracking company is TNT, and you might want to contact them directly to see if they can assist further.\n\nBest regards,\n<PERSON>',
 'Hi,\nThank you for reaching out. Unfortunately, we canno

In [59]:
def compute_metrics_for_data(data_list):
    """
    Given the data list object
    compute various scores / metrics for data object

    Args:
        - data_list (list): Data from csv file
    """
    # Step 1: Compute bleu scores
    # unigram_score, bigram_score, trigram_score, quadgram_score, generalised_score = compute_bleu(data_list)
    # print(f'BLEU Score for corpus : {generalised_score}')
    # print(f'BLEU Score for corpus :\n\t- unigram_score: {unigram_score}\n\t- bigram_score: {bigram_score}\n\t- trigram_score: {trigram_score}\n\t- quadgram_score: {quadgram_score}')

    # Step 2: Compute rouge scores
    scores = compute_rouge_scores(data_list)
    print(scores)

In [60]:
compute_metrics_for_data(equal_data)
compute_metrics_for_data(non_equal_data)


{'rouge1': {'precision': 0.95, 'recall': 1.0, 'fmeasure': 0.9666666666666666}, 'rouge2': {'precision': 0.9, 'recall': 0.9, 'fmeasure': 0.9}, 'rougeL': {'precision': 0.95, 'recall': 1.0, 'fmeasure': 0.9666666666666666}}
{'rouge1': {'precision': 0.7170397304297738, 'recall': 0.6583446239807726, 'fmeasure': 0.6400066019093208}, 'rouge2': {'precision': 0.5355760602606692, 'recall': 0.47728833513663105, 'fmeasure': 0.48640880562768063}, 'rougeL': {'precision': 0.6608354345124942, 'recall': 0.6207622787897317, 'fmeasure': 0.6017041217453112}}


In [28]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

scores = {
    'relevance': 0,
    'empathy': 0,
    'completeness': 0,
    'clarity': 0,
    'tone': 0,
    'personalization': 0,
    'grammatical_correctness': 0
    }

# Example scoring
# Compare relevance
# scores['relevance'] = compare_relevance(response, agent_response, prev_context)

# # Check for grammatical correctness
# scores['grammatical_correctness'] = check_grammar(response)

# Empathy score (could use sentiment analysis as a proxy)
sia = SentimentIntensityAnalyzer()
scores['empathy'] = sia.polarity_scores('I am so sorry to hear that')['compound']
print(scores)
# Aggregate score
total_score = sum(scores.values()) / len(scores)

{'relevance': 0, 'empathy': -0.1513, 'completeness': 0, 'clarity': 0, 'tone': 0, 'personalization': 0, 'grammatical_correctness': 0}


In [33]:
import spacy

# Load a medium-sized English model
nlp = spacy.load('en_core_web_md')

def compare_relevance(ai_response, human_response, context):
    # Convert text to spaCy document objects
    doc_context = nlp(context)
    doc_ai = nlp(ai_response)
    doc_human = nlp(human_response)
    
    # Calculate similarity scores
    ai_similarity = doc_context.similarity(doc_ai)
    human_similarity = doc_context.similarity(doc_human)
    
    # Compare and assign relevance scores
    relevance_score = {
        'ai_relevance': ai_similarity,
        'human_relevance': human_similarity
    }
    return relevance_score

# Example usage
context = "I need to return a damaged item I received yesterday."
ai_response = "You can start a return process by logging into your account."
human_response = "I'm sorry to hear that! To return the item, please log in and follow the instructions on our returns page."

compare_relevance(ai_response, human_response, context)


{'ai_relevance': 0.7192914192349255, 'human_relevance': 0.8101837959171697}

In [31]:
!pip install spacy


Collecting spacy
  Downloading spacy-3.7.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp39-cp39-macosx_11_0_arm64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp39-cp39-macosx_11_0_arm64.whl.metadata (8.4 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp39-cp39-macosx_11_0_arm64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy)
  Downloading thinc-8.2.3-cp39-cp39-macosx_11_0_arm64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.2-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsl

In [34]:
"""
1. AI <> HUMAN RESPONSE similarity
    - cosine 
    - rouge
    - bleu
    - NLI Entailment score
2. Context <> AI RESPONSE
    - cosine
"""

[['id', 'prev_context', 'response', 'agent_response', 'sources'],
 ['9eb89242-c9a8-496f-aa5a-78aee3b1724e',
  "Customer's Message: I have not got an refound\n\nSent from\nOutlook for iOS\n****************************************\nFrom:\nPupRing Customer Care\nSent:\n<DATE_TIME> 6:41:28 PM\nTo:\n<EMAIL_ADDRESS>\nSubject:\nRe: Order\n\nDear <PERSON>,\n\nYes, a refund has been processed already. Please check your\naccount for the confirmation. We apologize for any\ninconvenience and hope to serve you better in the future.\n\nWarm regards,\n\nPupRing Customer Care\n\n<PERSON> from <LOCATION>'s Message: Dear <PERSON>,\nYes, a refund has been processed already. Please check your account for the confirmation. We apologize for any inconvenience and hope to serve you better in the future.\nWarm regards,\nPupRing Customer Care\nCustomer's Message: <PERSON> so i will get a refound?\n\nSent from\nOutlook for iOS\n****************************************\nFrom:\nPupRing Customer Care\nSent:\n<DATE_

In [2]:
from sentence_transformers import SentenceTransformer
sentences_1 = ["Customer's Message: Hi,\n\n<LOCATION> <PERSON> enig <PERSON> ongeveer binnenkomt?\n\nMet vriendelijke groet,\n\n<PERSON>\n", "Dear <PERSON>,\n\nThank you for reaching out. I'm pleased to inform you that your order has been fulfilled. If you have not yet received a tracking update, please let us know and we will ensure you get the necessary information.\n\nKind regards,\n<PERSON><LOCATION>"]
sentences_2 = ["Customer's Message: Hi,\n\n<LOCATION> <PERSON> enig <PERSON> ongeveer binnenkomt?\n\nMet vriendelijke groet,\n\n<PERSON>\n", "Dear <PERSON>,\n\nThank you for reaching out. I'm pleased to inform you that your order has been fulfilled. If you have not yet received a tracking update, please let us know and we will ensure you get the necessary information.\n\nKind regards,\n<PERSON><LOCATION>"]
model = SentenceTransformer('BAAI/bge-base-en-v1.5')
embeddings_1 = model.encode(sentences_1[1], normalize_embeddings=True)
embeddings_2 = model.encode(sentences_2[1], normalize_embeddings=True)
similarity = embeddings_1 @ embeddings_2.T
print(similarity)

1.0


In [2]:
from nltk.sentiment import SentimentIntensityAnalyzer

def measure_empathy(response):
    sia = SentimentIntensityAnalyzer()
    scores = sia.polarity_scores(response)
    empathy_score = scores['compound']
    return empathy_score

# Example usage
response = ""

empathy_score = measure_empathy(response)
print(f"Empathy score: {empathy_score}")


Empathy score: 0.0


In [18]:
import ast
import json
import csv
import sys
import os
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

data_path = '../../data'

# Step 1: Read data from the structured JSON files for equal data
with open(f'{data_path}/structured_equal.json', 'r') as f:
    equal_data = json.load(f)  # Load the data from JSON file into a Python list

# Step 2: Read data from the structured JSON files for non-equal data
with open(f'{data_path}/structured_non_equal.json', 'r') as f:
    non_equal_data = json.load(f)

In [19]:
equal_data[4]

{'id': 'd608e6bc-44dd-4016-82a8-4d1128195b1b',
 'prev_context_conversation': [{'customer': 'can I have one more air hipper? thanks'},
  {'agent': "Hi <PERSON>,\nI'm reaching out to inform you that there was a system error on our end in which we've had to cancel and refund your order. So sorry about this! If you would like to avoid the steps of re-creating the cart, I am happy to send over an invoice to you. However, the French Cut is out of stock. I would need to know the style you would like to purchase as the replacement.\nOur apologies again for the inconvenience!, and I'm looking forward to hearing from you."}],
 'ai_response': "Hi <PERSON>,\n\nI'm sorry for the inconvenience. Unfortunately, much of our inventory is low at the moment. I would recommend placing a new order on our website, you might find something else you like. Once you've placed your order, please provide me with the order number and I will refund your shipping fee as a way to apologize for this inconvenience.\n\nB

In [20]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('BAAI/bge-base-en-v1.5')

In [21]:
import random
idx = random.randint(1,1000)
print(idx)
prev = equal_data[idx]['prev_context_conversation']
prev_conversation_text = ''
for msg in prev:
    prev_conversation_text += f' {list(msg.values())[0]}'

similarity = compute_similarity(model, equal_data[idx]['ai_response'], prev_conversation_text)
print(f"similarity score: {similarity}")

prev_conversation_text += equal_data[idx]['ai_response']
empathy_score = measure_empathy(prev_conversation_text)
print(f"Empathy score: {empathy_score}")

704
similarity score: 0.814781904220581
Empathy score: 0.998


In [25]:
equal_data[455]
from nltk.translate.bleu_score import sentence_bleu

In [35]:
import random
idx = random.randint(1,1000)
print(idx)

"""
prev_conversation_text <> source_conversation_text - cosine
C-AI : 
  - prev_conversation_text + ai_response - empathy
  - prev_conversation_text <> ai_response - cosine
C-H : 
  - prev_conversation_text + human_response - empathy
  - prev_conversation_text <> human_response - cosine
S-AI : 
  - source_conversation_text + ai_response - empathy
  - source_conversation_text <> ai_response - cosine
S-H : 
  - source_conversation_text + human_response - empathy
  - source_conversation_text <> human_response - cosine
AI-H :
    - ai_response <> human_response - cosine
    - ai_response <> human_response - bleu (5 values)
    - ai_response <> human_response - rouge (3 values)
"""

scores = {}

# data_obj = equal_data
data_obj = non_equal_data

for idx in range(len(data_obj)-4, len(data_obj)):
  scores = {}

  prev = data_obj[idx]['prev_context_conversation']
  prev_conversation_text = ''
  for msg in prev:
      prev_conversation_text += f' {list(msg.values())[0]}'

  source = data_obj[idx]['source_conversation']
  source_conversation_text = ''
  for msg in source:
      source_conversation_text += f' {list(msg.values())[0]}'

  ai_response = data_obj[idx]['ai_response']
  human_response = data_obj[idx]['human_response']

  source_context_similarity = compute_similarity(model, source_conversation_text, prev_conversation_text)
  print(f"prev_conversation_text <> source_conversation_text - cosine score: {similarity}")
  print('--------')
  scores['source_context_similarity'] = source_context_similarity


  ai_prev_conversation_text = prev_conversation_text + ai_response
  ai_context_empathy = measure_empathy(ai_prev_conversation_text)
  scores['ai_context_empathy'] = ai_context_empathy
  print(f"prev_conversation_text + ai_response - empathy score: {ai_context_empathy}")

  ai_context_similarity = compute_similarity(model, prev_conversation_text, ai_response)
  print(f"prev_conversation_text <> ai_response - cosine score: {ai_context_similarity}")
  scores['ai_context_similarity'] = ai_context_similarity
  print('--------')

  human_prev_conversation_text = prev_conversation_text + human_response
  human_context_empathy = measure_empathy(human_prev_conversation_text)
  print(f"prev_conversation_text + human_response - empathy score: {human_context_empathy}")
  scores['human_context_empathy'] = human_context_empathy

  human_context_similarity = compute_similarity(model, prev_conversation_text, human_response)
  print(f"prev_conversation_text <> human_response - cosine score: {human_context_similarity}")
  scores['human_context_similarity'] = human_context_similarity
  print('--------')

  ai_source_conversation_text = source_conversation_text + ai_response
  empathy_score = measure_empathy(ai_source_conversation_text)
  print(f"source_conversation_text + ai_response - empathy score: {empathy_score}")
  similarity = compute_similarity(model, source_conversation_text, ai_response)
  print(f"source_conversation_text <> ai_response - cosine score: {similarity}")
  print('--------')

  human_source_conversation_text = source_conversation_text + human_response
  empathy_score = measure_empathy(human_source_conversation_text)
  print(f"source_conversation_text + human_response - empathy score: {empathy_score}")
  similarity = compute_similarity(model, source_conversation_text, human_response)
  print(f"source_conversation_text <> human_response - cosine score: {similarity}")
  print('--------')
  similarity = compute_similarity(model, ai_response, human_response)
  print(f"ai_response <> human_response - cosine score: {similarity}")
  rouge = compute_rouge_scores(ai_response, human_response)
  print(f"ai_response <> human_response - rouge score: {rouge}")
  bleu = sentence_bleu([ai_response], human_response, weights=(0.25,0.25,0.25,0.25))
  print(f"ai_response <> human_response - bleu score: {bleu}")

825
prev_conversation_text <> source_conversation_text - cosine score: 0.7554283738136292
--------
prev_conversation_text + ai_response - empathy score: 0.9934
prev_conversation_text <> ai_response - cosine score: 0.6414504051208496
--------
prev_conversation_text + human_response - empathy score: 0.9908
prev_conversation_text <> human_response - cosine score: 0.7641831040382385
--------
source_conversation_text + ai_response - empathy score: 0.8703
source_conversation_text <> ai_response - cosine score: 0.8451265096664429
--------
source_conversation_text + human_response - empathy score: 0.7518
source_conversation_text <> human_response - cosine score: 0.8164573907852173
--------
ai_response <> human_response - cosine score: 0.6503177881240845
ai_response <> human_response - rouge score: {'rouge1': Score(precision=0.5294117647058824, recall=0.32727272727272727, fmeasure=0.4044943820224719), 'rouge2': Score(precision=0.24242424242424243, recall=0.14814814814814814, fmeasure=0.18390804