**CHAIN OF THOUGHT PROMPTING**

This file contains code to generate literal translations using Chain of Thought (CoT) prompting. We use k-nearest-neighbour (KNN) search to retrieve the two training samples most similar to the input for our few-shot CoT prompt, and use the explanation attached to each of these examples as the reasoning for its literal translation.


You will need to add your OpenAI API key to get the results of prompting.
For classification, the fine-tuned model is linked to our account. Please ask us for the corresponding OpenAI API key if you need to run the classification step.

In [None]:
!pip install openai
!pip install datasets
!pip install sentence_transformers
!pip install git+https://github.com/google-research/bleurt.git
!pip install transformers
!pip install bert_score
!git clone https://github.com/google-research/bleurt.git
%cd bleurt
!pip install .
%cd /content/
!wget https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip .
!unzip BLEURT-20.zip
!pip install faiss-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.6-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.9/71.9 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp (from openai)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m67.0 MB/s[0m eta [36m0:00:00[0m
Collecting multidict<7.0,>=4.5 (from aiohttp->openai)
  Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting async-timeout<5.0,>=4.0.0a3 (from aiohttp->openai)
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting yarl<2.0,>=1.0 (from aiohttp->openai)
  Downloadin

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import openai
import os
import pandas as pd
import time
import re
from datasets import load_metric, load
from bleurt import score as bleurt_score
from bert_score import score as bert_score
import string
from sentence_transformers import SentenceTransformer, util
import faiss

In [None]:
import torch
import random
import numpy as np

# Seed every random number generator used in this notebook with the same value.
SEED = 0
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, np.random.seed, random.seed):
  seed_fn(SEED)

# cuDNN flags set alongside the seeds for reproducible runs.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# This notebook requires a GPU: stop here if none is visible.
assert torch.cuda.is_available()

# Report which GPU was found and how many are available.
n_gpu = torch.cuda.device_count()
device_name = torch.cuda.get_device_name()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")
device = torch.device("cuda")

Found device: Tesla T4, n_gpu: 1


In [None]:
# Load the prompting test split and add an empty `Output` column that will
# receive the raw model completion for every row.
test_df = pd.read_csv('../Datasets/Prompting/prompting_test.csv').assign(Output="")

In [None]:
# Load the training split that supplies the few-shot examples.
train_df = pd.read_csv('../Datasets/Prompting/train-0.3.csv')
# removing datapoints with no explanation
# (vectorized `dropna` instead of a row-wise `apply`; keeps the same rows)
train_df = train_df.dropna(subset=['Explanation'])

In [None]:
# NOTE: despite its name, this frame keeps only the rows that DO have a
# premise (rows whose `Premise` is missing are filtered out).
train_df_no_premise = train_df[train_df["Premise"].notna()]

In [None]:
# Two OpenAI keys are used:
#  * classification_key - used by `get_type` for the fine-tuned classifier,
#    which is tied to the authors' account (ask the authors for this key).
#  * prompting_key - your own key, used for the chain-of-thought completions.
# Keys are read from the environment when available so they do not have to be
# pasted into the notebook; the placeholders below are the fallback.
classification_key = os.environ.get("OPENAI_CLASSIFICATION_KEY", "<ask_for_our_model_key>")
prompting_key = os.environ.get("OPENAI_API_KEY", "<replace_with_your_key>")
openai.api_key = prompting_key

In [None]:
def get_type(text):
  """Classify `text` with the fine-tuned model and return its figurative type.

  Args:
    text: the figurative sentence to classify.

  Returns:
    One of "Sarcasm", "Simile", "Metaphor" or "Idiom", as extracted by
    `get_pred` from the completion.
  """
  # The fine-tuned model needs the classification key, but the rest of the
  # notebook prompts with a different key: swap it in only for this request
  # and always restore whatever key was active before.
  previous_key = openai.api_key
  openai.api_key = classification_key
  try:
    res = openai.Completion.create(model='ada:ft-personal-2023-05-15-01-40-58', prompt=text, temperature=0, top_p=1.0, max_tokens = 100)
  finally:
    openai.api_key = previous_key
  return get_pred(res, ["Sarcasm","Simile","Metaphor","Idiom"])

In [None]:
def get_pred(res, labels):
  """Extract the predicted label from a completion response.

  Args:
    res: completion response; the text is read from res["choices"][0]["text"].
    labels: collection of valid label strings.

  Returns:
    The first whitespace-separated token of the completion (punctuation
    removed) that is in `labels`, or "Metaphor" when no token matches.
  """
  text = res["choices"][0]["text"].translate(str.maketrans('', '', string.punctuation))
  # split() without an argument splits on any whitespace run, so a label that
  # is followed by a newline or tab (e.g. " Sarcasm\n") is still recognised.
  for token in text.split():
    if token in labels:
      return token
  return "Metaphor"

In [None]:
# Sentence encoder used both for the training hypotheses and, later, for each query.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode all training hypotheses in one batched call instead of growing the
# array row by row with np.append (which copies the whole array every time).
# Row i of `embeddings` corresponds to train_df_no_premise.iloc[i].
embeddings = model.encode(train_df_no_premise.Hypothesis.tolist(), convert_to_numpy=True)
print(embeddings.shape)

(1, 384)


In [None]:
# select the nearest neighbours from the train dataset for examples in few shot cot
def select_cot_example(train_df, fig_type, fig_input, k=2):
  """Return the positions of the `k` training hypotheses closest to `fig_input`.

  Neighbours are found with a FAISS L2 index built over the global
  `embeddings` array, so the returned values are row positions into
  `embeddings` (and into the frame those embeddings were computed from).

  Args:
    train_df: unused; kept for backward compatibility with existing callers.
    fig_type: unused; kept for backward compatibility with existing callers.
    fig_input: the figurative sentence to find neighbours for.
    k: number of neighbours to return (defaults to 2).

  Returns:
    Array with the `k` neighbour positions for `fig_input`.
  """
  # Build the index once and reuse it across calls; rebuild only if the global
  # `embeddings` array has been replaced by a different object.
  cached = getattr(select_cot_example, "_index_cache", None)
  if cached is None or cached[0] is not embeddings:
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    select_cot_example._index_cache = (embeddings, index)
  else:
    index = cached[1]
  embed = np.array([model.encode(fig_input, convert_to_numpy=True)])
  _, neighbour_ids = index.search(embed, k)
  # search() returns one row per query; there is a single query here.
  return neighbour_ids[0]

In [None]:
def create_prompt(fig_input, fig_type, neighbours, df):
  """Assemble the few-shot chain-of-thought prompt for `fig_input`.

  Each position in `neighbours` contributes one worked example taken from
  `df` (columns `Type`, `Hypothesis`, `Explanation`, `Premise`). The prompt
  ends with the query sentence and an open "Reasoning:" line for the model
  to continue.

  Args:
    fig_input: the figurative sentence to translate.
    fig_type: figurative type of `fig_input`; lower-cased in the prompt.
    neighbours: iterable of row positions into `df`.
    df: DataFrame holding the example rows.

  Returns:
    The complete prompt string.
  """
  shots = []
  for position in neighbours:
    row = df.iloc[position]
    kind = row['Type'].lower()
    shots.append(
      "Give literal translation of this " + kind + ": " + row['Hypothesis']
      + "\nReasoning: " + row['Explanation']
      + "\nThis is why the sentence is a " + kind
      + ".\nLiteral translation: " + row['Premise'] + "\n\n"
    )
  query = "Give literal translation of this " + fig_type.lower() + ": " + fig_input + "\nReasoning:"
  return "".join(shots) + query

In [None]:
# classify the figurative input and select an example for chain of thought
# generate a prompt using the example and figurative input
# call completion model for CoT prompting

def chain_of_thought(fig_input):
  """Run few-shot chain-of-thought prompting for one figurative sentence.

  Steps: classify the sentence, retrieve its nearest training examples,
  build the prompt from them, and ask the completion model to continue it.
  The prompt and the completion are printed as they are produced.

  Args:
    fig_input: the figurative sentence to translate.

  Returns:
    The raw completion text.
  """
  fig_type = get_type(fig_input)

  # The neighbour positions index into `embeddings`, which were computed from
  # `train_df_no_premise`; the examples must be looked up in that same frame,
  # otherwise the positions point at the wrong rows of `train_df`.
  neighbours = select_cot_example(train_df_no_premise, fig_type, fig_input)

  prompt = create_prompt(fig_input, fig_type, neighbours, train_df_no_premise)
  print(prompt, end = "")

  # get_type() uses the classification key; make sure the prompting key is the
  # active one before calling the completion model.
  openai.api_key = prompting_key
  lit_output = openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt,
    max_tokens = 1500,
  )
  completion_text = lit_output.choices[0].text
  print(completion_text + '\n')
  return completion_text

In [None]:
# Generate a chain-of-thought completion for every test hypothesis and store
# the raw text in the `Output` column of the same row.
for ind, fig_input in test_df['Hypothesis'].items():
    lit_output = chain_of_thought(fig_input)
    test_df.loc[ind, ['Output']] = lit_output
    # short pause between consecutive API requests
    time.sleep(1)
    print('-----------')
print("DONE!")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
This is why the sentence is a sarcasm.
Literal translation: I hate the sound of people loudly breathing while they eat a sandwich.

-----------
Give literal translation of this sarcasm: I was ecstatic when I found out the chicken I ordered was still pink on the inside
Reasoning: Finding out that the chicken someone ordered was still pink means that it was not cooked properly and getting uncooked food doesn't make one happy.
This is why the sentence is a sarcasm.
Literal translation: I was annoyed when I found out the chicken I ordered was still pink on the inside

Give literal translation of this sarcasm: I felt extremely relieved after seeing that the chicken in my sandwich was still very pink/red in the middle
Reasoning: Undercooked chicken can cause food poisoning and so it is not a relieving thing to see it still pink or red in the middle as that means it is not cooked properly
This is why the sentence is a sarcasm.
L

In [None]:
# Inspect the test frame, now holding the raw completions in `Output`.
test_df

Unnamed: 0.1,Unnamed: 0,Index,Hypothesis,Premise,Type,Explanation,Label,SarcasmSimilarity,Source,Output
0,331,50,I heard a knock on my door at 2 am in the morn...,I heard a knock on my door at 2 am in the morn...,Sarcasm,Unknown people knocking on someone's door at 2...,Contradiction,UNQ,FLUTE,Hearing a knock on the door at 2 am in the mo...
1,233,1630,I was very proud of my sense of judgement when...,I was seriously doubting my sense of judgement...,Sarcasm,: Someone who is admired for their great visio...,Contradiction,UNQ,FLUTE,This statement is ironic because it would be ...
2,66,2112,I adore it when people blast their music durin...,I despise it when people blast their music dur...,Sarcasm,People blasting their music during the wee hou...,Contradiction,UNQ,FLUTE,Playing loud music in the middle of the night...
3,374,1833,I am pleased because the last thing I need is ...,I've been getting really bad headaches recently,Sarcasm,Having a headache makes someone's day worse as...,Contradiction,UNQ,FLUTE,A headache is really a painful and unpleasant...
4,452,1057,I was extremely happy to find out that all my ...,I was extremely broken to find out that all my...,Sarcasm,Studying for an annual exam is often a waste o...,Contradiction,UNQ,FLUTE,Studying for an annual exam is often a necess...
...,...,...,...,...,...,...,...,...,...,...
395,332,3260,I know that woman too well to ever want to lay...,I know that woman too well to ever want to hur...,Idiom,To lay a finger on someone usually refers to h...,Entailment,UNQ,FLUTE,"This is an idiom that essentially means ""I kn..."
396,527,7919,‘ That spoke volumes for the character as well...,‘ That provided substantial information for th...,Idiom,,Entailment,,IMPLI,This idiom means to convey something importan...
397,569,6303,It 's all water under the bridge ; you do n't ...,It 's all in the past ; you do n't want to hea...,Idiom,,Entailment,,IMPLI,This idiom is commonly used to mean that some...
398,222,6481,The road and railway from Mallaig go south han...,The road and railway from Mallaig go south in ...,Idiom,,Entailment,,IMPLI,"Hand in hand means to be together, linked or ..."


In [None]:
def _split_cot_output(raw_output):
  """Split a raw completion into (reasoning, literal translation).

  The completion is split at the first case-insensitive occurrence of
  "literal translation:". When the marker is absent, the whole stripped
  completion is returned for both parts.
  """
  text = raw_output.strip()
  # maxsplit=1: a completion that repeats the marker must still yield exactly
  # two parts; an unbounded split would break the two-value unpacking.
  parts = re.split("literal translation:", text, maxsplit=1, flags=re.IGNORECASE)
  if len(parts) == 2:
    return parts[0].strip(), parts[1].strip()
  return text, text

# Separate every raw completion into its reasoning and its literal translation.
for ind in test_df.index:
  reasoning, lit_translation = _split_cot_output(test_df['Output'][ind])
  test_df.loc[ind, ['Reasoning']] = reasoning
  test_df.loc[ind, ['Literal Translation']] = lit_translation

In [None]:
# Inspect the frame after adding the `Reasoning` and `Literal Translation` columns.
test_df

Unnamed: 0.1,Unnamed: 0,Index,Hypothesis,Premise,Type,Explanation,Label,SarcasmSimilarity,Source,Output,Reasoning,Literal Translation
0,331,50,I heard a knock on my door at 2 am in the morn...,I heard a knock on my door at 2 am in the morn...,Sarcasm,Unknown people knocking on someone's door at 2...,Contradiction,UNQ,FLUTE,Hearing a knock on the door at 2 am in the mo...,Hearing a knock on the door at 2 am in the mor...,I was filled with dread when I heard a knock o...
1,233,1630,I was very proud of my sense of judgement when...,I was seriously doubting my sense of judgement...,Sarcasm,: Someone who is admired for their great visio...,Contradiction,UNQ,FLUTE,This statement is ironic because it would be ...,This statement is ironic because it would be m...,I was embarrassed of my decision making when I...
2,66,2112,I adore it when people blast their music durin...,I despise it when people blast their music dur...,Sarcasm,People blasting their music during the wee hou...,Contradiction,UNQ,FLUTE,Playing loud music in the middle of the night...,Playing loud music in the middle of the night ...,I hate when people play music at 3am in the mi...
3,374,1833,I am pleased because the last thing I need is ...,I've been getting really bad headaches recently,Sarcasm,Having a headache makes someone's day worse as...,Contradiction,UNQ,FLUTE,A headache is really a painful and unpleasant...,A headache is really a painful and unpleasant ...,I really don't want to get a headache today be...
4,452,1057,I was extremely happy to find out that all my ...,I was extremely broken to find out that all my...,Sarcasm,Studying for an annual exam is often a waste o...,Contradiction,UNQ,FLUTE,Studying for an annual exam is often a necess...,Studying for an annual exam is often a necessa...,I was extremely disappointed to find out that ...
...,...,...,...,...,...,...,...,...,...,...,...,...
395,332,3260,I know that woman too well to ever want to lay...,I know that woman too well to ever want to hur...,Idiom,To lay a finger on someone usually refers to h...,Entailment,UNQ,FLUTE,"This is an idiom that essentially means ""I kn...","This is an idiom that essentially means ""I kno...",I am very familiar with that woman so I have n...
396,527,7919,‘ That spoke volumes for the character as well...,‘ That provided substantial information for th...,Idiom,,Entailment,,IMPLI,This idiom means to convey something importan...,This idiom means to convey something important...,‘That communicated a lot about the personality...
397,569,6303,It 's all water under the bridge ; you do n't ...,It 's all in the past ; you do n't want to hea...,Idiom,,Entailment,,IMPLI,This idiom is commonly used to mean that some...,This idiom is commonly used to mean that somet...,It has already occurred and it cannot be chang...
398,222,6481,The road and railway from Mallaig go south han...,The road and railway from Mallaig go south in ...,Idiom,,Entailment,,IMPLI,"Hand in hand means to be together, linked or ...","Hand in hand means to be together, linked or a...",The road and railway from Mallaig run side by ...


In [None]:
def calculate_bleurt(df, checkpoint='/content/BLEURT-20'):
  """Score the generated literal translations with BLEURT.

  Args:
    df: DataFrame with a `Premise` column (references) and a
      `Literal Translation` column (candidates).
    checkpoint: path of the BLEURT checkpoint to load; defaults to the
      location the notebook's setup cell unpacks BLEURT-20 into.

  Returns:
    The per-row BLEURT scores as returned by the scorer.
  """
  scorer = bleurt_score.BleurtScorer(checkpoint)
  # Pass plain Python lists so the scorer does not depend on the frame's index.
  bleurt_scores = scorer.score(
    references=df['Premise'].tolist(),
    candidates=df['Literal Translation'].tolist(),
  )
  print(bleurt_scores)
  return bleurt_scores

def calculate_bertscore(df):
  """Score the generated literal translations with BERTScore (F1).

  Args:
    df: DataFrame with a `Literal Translation` column (candidates) and a
      `Premise` column (references).

  Returns:
    List with one BERTScore F1 value per row.
  """
  # Use the `bert_score` function this notebook already imports rather than
  # `datasets.load_metric('bertscore')`: `load_metric` is deprecated in the
  # `datasets` library, and the setup cell installs `datasets` unpinned.
  _, _, f1 = bert_score(
    df['Literal Translation'].tolist(),
    df['Premise'].tolist(),
    lang="en",
  )
  f1_scores = f1.tolist()
  print(f1_scores)
  return f1_scores

def calculate_littransscore(bert_scores, bleurt_scores):
  """Combine BERTScore and BLEURT into one literal translation score per item.

  For every position in `bleurt_scores`, the two scores at that position are
  added and multiplied by 50.

  Args:
    bert_scores: indexable sequence of BERTScore values.
    bleurt_scores: sequence of BLEURT values; its length sets the output length.

  Returns:
    List of combined scores.
  """
  scale = 50.0
  return [
    (bert_scores[pos] + bleurt_scores[pos]) * scale
    for pos in range(len(bleurt_scores))
  ]

In [None]:
# BLEURT of each generated literal translation against its reference premise.
bleurt_scores = calculate_bleurt(test_df)

[0.6178280115127563, 0.6765174865722656, 0.7004265189170837, 0.46850693225860596, 0.8834017515182495, 0.8567996025085449, 0.7406611442565918, 0.7667922973632812, 0.6042110323905945, 0.8005295395851135, 0.7726153135299683, 0.6836861371994019, 0.677539050579071, 0.7678931951522827, 0.8541590571403503, 0.7248182892799377, 0.8487015962600708, 0.7059642672538757, 0.6895152926445007, 0.631512463092804, 0.8042793869972229, 0.6896601319313049, 0.9113443493843079, 0.6894431114196777, 0.7652260661125183, 0.8794838786125183, 0.7452359795570374, 0.5509359240531921, 0.8178381323814392, 0.5757505893707275, 0.5300754308700562, 0.770721435546875, 0.610717236995697, 0.6190123558044434, 0.7171847820281982, 0.8315653204917908, 0.6932161450386047, 0.5826987624168396, 0.7987812161445618, 0.5733698010444641, 0.8765395283699036, 0.721207320690155, 0.7118167877197266, 0.8316414952278137, 0.7647915482521057, 0.712286114692688, 0.7023654580116272, 0.7694755792617798, 0.6437788605690002, 0.696672260761261, 0.800

In [None]:
# BERTScore F1 of each generated literal translation against its reference premise.
bert_scores = calculate_bertscore(test_df)

  bertscore = load_metric('bertscore')


Downloading builder script:   0%|          | 0.00/2.92k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

[0.9489826560020447, 0.9293994307518005, 0.9524160623550415, 0.8876468539237976, 0.9887697696685791, 0.9710198640823364, 0.9592796564102173, 0.940705418586731, 0.9236050248146057, 0.9524333477020264, 0.9373872876167297, 0.9161515235900879, 0.9169740676879883, 0.9394298791885376, 0.9969666600227356, 0.9265087246894836, 0.9780773520469666, 0.9309607148170471, 0.9130401611328125, 0.9184983372688293, 0.9890386462211609, 0.9533452987670898, 0.990375816822052, 0.9515777230262756, 0.9767053127288818, 0.9938803911209106, 0.9516685009002686, 0.907016396522522, 0.9837223887443542, 0.9126981496810913, 0.9464542865753174, 0.9387954473495483, 0.9567882418632507, 0.9298005700111389, 0.9302237033843994, 0.9887343645095825, 0.9283172488212585, 0.9141194224357605, 0.9665529727935791, 0.9085427522659302, 0.9754564166069031, 0.9438025951385498, 0.9434091448783875, 0.9820259809494019, 0.9733522534370422, 0.9274487495422363, 0.9080031514167786, 0.9578512907028198, 0.9475492238998413, 0.9192537069320679, 0.

In [None]:
# Print the minimum and maximum of each metric over the whole test set.
for heading, metric_scores in (
  ("Range of BLEURT: ", bleurt_scores),
  ("Range of BERTScore: ", bert_scores),
):
  print(heading)
  print(min(metric_scores))
  print(max(metric_scores))

Range of BLEURT: 
0.24967145919799805
1.003772497177124
Range of BERTScore: 
0.843066394329071
1.0


In [None]:
# Compute literal translation score
lit_trans_scores = calculate_littransscore(bert_scores, bleurt_scores)
test_df['Score'] = lit_trans_scores
test_df['BLEURT'] = bleurt_scores
test_df['BERT'] = bert_scores

# Group the scores by figurative type once and reuse the grouping.
scores_by_type = test_df.groupby('Type')['Score']
lit_trans_scores_scores_metaphor = scores_by_type.get_group('Metaphor')
lit_trans_scores_scores_idiom = scores_by_type.get_group('Idiom')
lit_trans_scores_scores_simile = scores_by_type.get_group('Simile')
lit_trans_scores_scores_sarcasm = scores_by_type.get_group('Sarcasm')

# Overall range of the combined score.
print(min(lit_trans_scores))
print(max(lit_trans_scores))

# Report the per-type average with two decimals; `%d` would truncate the
# fractional part (e.g. 84.9 would be shown as 84).
for type_name, type_scores in (
  ('metaphor', lit_trans_scores_scores_metaphor),
  ('idiom', lit_trans_scores_scores_idiom),
  ('simile', lit_trans_scores_scores_simile),
  ('sarcasm', lit_trans_scores_scores_sarcasm),
):
  print('Average literal translation score for %s: %.2f' % (type_name, type_scores.mean()))


55.79249858856201
100.1886248588562
Average literal translation score for metaphor: 80
Average literal translation score for idiom: 82
Average literal translation score for simile: 75
Average literal translation score for sarcasm: 84


In [None]:
# Export results
# NOTE: the variable is called `test_zero_shot`, but it holds the results of
# the chain-of-thought (KNN few-shot) run produced in this notebook.
test_zero_shot = test_df.loc[:,['Hypothesis','Literal Translation','Premise','Reasoning','Score','BLEURT','BERT','Type','Source']]
results_path = '../Results/Prompting/chain_of_thought_results_knn.csv'
# Create the output directory if needed so a long run is not lost to a
# missing-folder error at the very last step.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
test_zero_shot.to_csv(results_path)