**CHITCHAT TEST SET**

In [None]:
import pickle
from sklearn.model_selection import train_test_split

dataset = pd.read_pickle('chitchat.pkl') 

train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=7)

with open('chittest.pkl', 'wb') as test_file:
    pickle.dump(test_data, test_file)

**COSINE SIMILARITY TO GET GOLDEN RESPONSE - CHITCHAT**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_pickle('chitchat.pkl')

def similar(prompt):
  input_sentence = prompt

  vectorizer = TfidfVectorizer()
  prompt_vectors = vectorizer.fit_transform(df['prompt'])

  input_vector = vectorizer.transform([input_sentence])

  similarities = cosine_similarity(input_vector, prompt_vectors).flatten()

  most_similar_index = np.argmax(similarities)

  most_similar_prompt = df.loc[most_similar_index, 'prompt']
  most_similar_message = df.loc[most_similar_index, 'message']

  return most_similar_message

**BERT SCORE**

In [None]:
!pip install bert_score

In [None]:
from bert_score import score

def bertscore(response, goldresponse):
  reference_text = goldresponse
  generated_text = response

  bert_score = score(generated_text, reference_text, lang="en", model_type="bert-base-uncased")

  f1_score = bert_score[2][0].item()

  return f1_score

In [None]:
data = pd.read_pickle("chittest.pkl")
res = []
gold = []

for i,row in data.iterrows():
  prompt = row['prompt']
  response = generate_response(prompt)
  goldresponse = similar(prompt)
  res.append(response)
  gold.append(goldresponse)

In [7]:
bertsc = bertscore(res, gold)
bertsc = format(bertsc, ".2f")
print("BERT : ",bertsc)

BERT : 0.45


**PERPLEXITY**

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

def calculate_perplexity(text):
    inputs = tokenizer.encode_plus(text, return_tensors='pt', padding=True, truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), inputs["input_ids"].view(-1))
    perplexity = torch.exp(loss)

    return perplexity.item()

In [5]:
data = pd.read_pickle("chittest.pkl")
perpl = 0

for i,row in data.iterrows():
  prompt = row['prompt']
  response = generate_response(prompt)
  perpl += calculate_perplexity(response)

perpl = perpl/len(data)
perpl = format(perpl, ".2f")
print(perpl/len(data))

PERPLEXITY : 15.95


**ROUGE**

In [None]:
!pip install rouge

In [None]:
from rouge import Rouge

def rougescore(response, goldresponse):
  reference_text = goldresponse
  generated_text = response

  rouge = Rouge()

  scores = rouge.get_scores(generated_text, reference_text)

  rouge_1_score = scores[0]["rouge-1"]["f"]
  rouge_2_score = scores[0]["rouge-2"]["f"]
  rouge_l_score = scores[0]["rouge-l"]["f"]

  return rouge_1_score, rouge_2_score, rouge_l_score

In [6]:
data = pd.read_pickle("chittest.pkl")
rougesc1 = 0
rougesc2 = 0
rougescl = 0

for i,row in data.iterrows():
  prompt = row['prompt']
  response = generate_response(prompt)
  goldresponse = similar(prompt)
  x,y,z = rougescore(response, goldresponse)
  rougesc1+=x
  rougesc2+=y
  rougescl+=z

rougesc1 = format(rougesc1, ".2f")
rougesc2 = format(rougesc2, ".2f")
rougescl = format(rougescl, ".2f")

print("ROUGE 1 : ",rougesc1/len(data))
print("ROUGE 2 : ",rougesc2/len(data))
print("ROUGE L : ",rougescl/len(data))

ROUGE 1 : 0.27
ROUGE 2 : 0.11
ROUGE L : 0.17


**BLEU**

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def bleuscore(response, goldresponse):

  reference_sentence = goldresponse
  generated_sentence = response

  reference_tokens = reference_sentence.split()
  generated_tokens = generated_sentence.split()

  bleu_score = sentence_bleu([reference_tokens], generated_tokens, weights=(0.9,0.1))

  return bleu_score

In [8]:
data = pd.read_pickle("chittest.pkl")
bleusc = 0
res = []
gold = []

for i,row in data.iterrows():
  prompt = row['prompt']
  response = generate_response(prompt)
  goldresponse = similar(prompt)
  res.append(response)
  gold.append(goldresponse)
  bleusc = bleuscore(res, gold)

bleusc = format(bleusc, ".2f")
print("BLEU : ",bleusc)

BLEU : 0.14


**EMPATHIC TEST SET**

In [None]:
import pickle
from sklearn.model_selection import train_test_split

dataset = pd.read_pickle('empathicdatafull.pkl') 

train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=7)

with open('empathictest.pkl', 'wb') as test_file:
    pickle.dump(test_data, test_file)

**COSINE SIMILARITY TO GET GOLDEN RESPONSE - EMPATHIC**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_pickle('empathicdatafull.pkl')

def similar(prompt):
  input_sentence = prompt

  vectorizer = TfidfVectorizer()
  prompt_vectors = vectorizer.fit_transform(df['prompt'])

  input_vector = vectorizer.transform([input_sentence])

  similarities = cosine_similarity(input_vector, prompt_vectors).flatten()

  most_similar_index = np.argmax(similarities)

  most_similar_prompt = df.loc[most_similar_index, 'prompt']
  most_similar_message = df.loc[most_similar_index, 'message']

  return most_similar_message

**BERT SCORE**

In [None]:
!pip install bert_score

In [None]:
from bert_score import score

def bertscore(response, goldresponse):
  reference_text = goldresponse
  generated_text = response

  print(len(reference_text))
  print(len(generated_text))

  bert_score = score(generated_text, reference_text, lang="en", model_type="bert-base-uncased")

  f1_score = bert_score[2][0].item()

  return f1_score

In [None]:
data = pd.read_pickle("empathictest.pkl")
res = []
gold = []

for i,row in data.iterrows():
  prompt = row['prompt']
  response = generate_response(prompt)
  goldresponse = similar(prompt)
  res.append(response)
  gold.append(goldresponse)

In [9]:
bertsc = bertscore(res, gold)
bertsc = format(bertsc, ".2f")
print("BERT : ",bertsc)

BERT : 0.62


**PERPLEXITY**

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

def calculate_perplexity(text):
    inputs = tokenizer.encode_plus(text, return_tensors='pt', padding=True, truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), inputs["input_ids"].view(-1))
    perplexity = torch.exp(loss)

    return perplexity.item()

In [10]:
data = pd.read_pickle("empathictest.pkl")
perpl = 0

for i,row in data.iterrows():
  prompt = row['prompt']
  response = generate_response(prompt)
  perpl += calculate_perplexity(response)

perpl = perpl/len(data)
perpl = format(perpl, ".2f")
print(perpl/len(data))

PERPLEXITY : 15.71


**ROUGE**

In [None]:
!pip install rouge

In [None]:
from rouge import Rouge

def rougescore(response, goldresponse):
  reference_text = goldresponse
  generated_text = response

  rouge = Rouge()

  scores = rouge.get_scores(generated_text, reference_text)

  rouge_1_score = scores[0]["rouge-1"]["f"]
  rouge_2_score = scores[0]["rouge-2"]["f"]
  rouge_l_score = scores[0]["rouge-l"]["f"]

  return rouge_1_score, rouge_2_score, rouge_l_score

In [11]:
data = pd.read_pickle("empathictest.pkl")
rougesc1 = 0
rougesc2 = 0
rougescll = 0

for i,row in data.iterrows():
  prompt = row['prompt']
  response = generate_response(prompt)
  goldresponse = similar(prompt)
  x,y,z = rougescore(response, goldresponse)
  rougesc1+=x
  rougesc2+=y
  rougescll+=z

rougesc1 = format(rougesc1, ".2f")
rougesc2 = format(rougesc2, ".2f")
rougescl = format(rougescl, ".2f")

print("ROUGE 1 : ",rougesc1/len(data))
print("ROUGE 2 : ",rougesc2/len(data))
print("ROUGE L : ",rougescl/len(data))

ROUGE 1 : 0.26
ROUGE 2 : 0.07
ROUGE L : 0.19


**BLEU**

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def bleuscore(response, goldresponse):

  reference_sentence = goldresponse
  generated_sentence = response

  reference_tokens = reference_sentence.split()
  generated_tokens = generated_sentence.split()

  bleu_score = sentence_bleu([reference_tokens], generated_tokens, weights=(0.9,0.1))

  return bleu_score

In [12]:
data = pd.read_pickle("empathictest.pkl")
bleusc = 0
res = []
gold = []

for i,row in data.iterrows():
  prompt = row['prompt']
  response = generate_response(prompt)
  goldresponse = similar(prompt)
  res.append(response)
  gold.append(goldresponse)
  bleusc = bleuscore(res, gold)

bleusc = format(bleusc, ".2f")
print("BLEU : ",bleusc)

BLEU : 0.19
