In [1]:
import os
import pandas as pd

# Move two directories up before importing the project helpers; presumably this is
# the repository root that holds the `scripts` package and the relative `data/` and
# `results/` paths used below — TODO confirm.
# NOTE(review): the path is relative, so re-running this cell without restarting the
# kernel moves up another two levels.
os.chdir("../../")
from scripts.llm import get_num_of_tokens, get_completion
from scripts.utils import save_obj_as_pickle, read_obj_from_pickle
from scripts.data import make_database, make_prompts_for_clf

# Show up to 150 characters per column when frames are displayed.
pd.set_option("display.max_colwidth", 150)

In [2]:
def loadData(fp):
    """Load a tab-separated paraphrase file into a two-column frame.

    The first line of the file is treated as a header and skipped. Every other
    non-blank line is split on tabs into the five fields
    ``label, id1, id2, text1, text2`` and each field is stripped of
    surrounding whitespace.

    Parameters
    ----------
    fp : str or path-like
        Path of the TSV file to read (decoded as UTF-8).

    Returns
    -------
    pandas.DataFrame
        Columns ``label`` ("Yes" for "1", "No" for "0", NaN for anything else)
        and ``text`` ("Text A: <text1>\\nText B: <text2>\\n").
    """
    columns = ["label", "id1", "id2", "text1", "text2"]

    # Use a context manager so the handle is closed, and pin the encoding so the
    # result does not depend on the platform's locale default.
    with open(fp, encoding="utf-8") as f:
        next(f, None)  # skip the header line (no-op on an empty file)
        # Split manually on tabs so quote characters inside the sentences are
        # kept verbatim; blank lines (e.g. a trailing newline) are ignored.
        rows = [[field.strip() for field in line.split("\t")]
                for line in f if line.strip()]

    raw = pd.DataFrame(rows, columns=columns)

    # Build the result as a fresh frame instead of mutating a column subset.
    return pd.DataFrame({
        "label": raw["label"].map({"0": "No", "1": "Yes"}),
        "text": "Text A: " + raw["text1"] + "\nText B: " + raw["text2"] + "\n",
    })

# Name of the classification task; it is reused in every data/results path below.
clf_task = "MRPC"
raw_dir = f"data/raw/text classification/{clf_task}"

# Load the test and dev (validation file) splits with the same loader.
test_fp, dev_fp = f"{raw_dir}/test.tsv", f"{raw_dir}/val.tsv"
test_df = loadData(test_fp)
dev_df = loadData(dev_fp)

test_df.head()

Unnamed: 0,label,text
0,Yes,"Text A: PCCW's chief operating officer, Mike Butcher, and Alex Arena, the chief financial officer, will report directly to Mr So.\nText B: Current..."
1,Yes,Text A: The world's two largest automakers said their U.S. sales declined more than predicted last month as a late summer sales frenzy caused more...
2,Yes,"Text A: According to the federal Centers for Disease Control and Prevention (news - web sites), there were 19 reported cases of measles in the Uni..."
3,No,Text A: A tropical storm rapidly developed in the Gulf of Mexico Sunday and was expected to hit somewhere along the Texas or Louisiana coasts by M...
4,No,Text A: The company didn't detail the costs of the replacement and repairs.\nText B: But company officials expect the costs of the replacement wor...


In [3]:
# Class balance of the test split: number of rows per label.
test_df.label.value_counts()

Yes    1147
No      578
Name: label, dtype: int64

In [4]:
# Same class balance expressed as a percentage of all test rows
# (fraction rounded to 3 decimals, then scaled by 100).
(test_df.label.value_counts() / len(test_df)).round(3) * 100

Yes    66.5
No     33.5
Name: label, dtype: float64

In [4]:
# Summary statistics of get_num_of_tokens applied to every test text
# (presumably a per-text token count; the tokenizer lives in scripts.llm — TODO confirm).
test_df.text.apply(get_num_of_tokens).describe()

count    1725.000000
mean       55.552464
std        13.058420
min        21.000000
25%        46.000000
50%        55.000000
75%        65.000000
max       108.000000
Name: text, dtype: float64

In [5]:
# Class balance of the dev split: number of rows per label.
dev_df.label.value_counts()

Yes    346
No     154
Name: label, dtype: int64

In [6]:
# Summary statistics of get_num_of_tokens applied to every dev text
# (presumably a per-text token count — TODO confirm against scripts.llm).
dev_df.text.apply(get_num_of_tokens).describe()

count    500.000000
mean      55.472000
std       12.616003
min       24.000000
25%       47.000000
50%       55.000000
75%       64.000000
max       92.000000
Name: text, dtype: float64

### Database

Sources for the prompt data

In [7]:
# Build the prompt database from the test and dev frames.
# NOTE(review): the exact meaning of num_instance / max_instance_size is defined in
# scripts.data.make_database and is not visible here — confirm before changing them.
database = make_database(test_df, dev_df, num_instance=500, max_instance_size=500)
# Show which entries the helper created.
database.keys()

dict_keys(['num_instance', 'max_instance_size', 'labels', 'testData', 'testInstances', 'devData', 'devInstances'])

#### Zero-shot

In [26]:
# Zero-shot prompt templates, one per task formulation.
# The $num, $text, $texts and $be tokens are placeholders that are filled in
# later when prompts are built (presumably by make_prompts_for_clf — TODO confirm).
database["promptTemplates"] = {"zero-shot": {}}

# One text pair per prompt, answered with Yes/No.
single_clf = (
    "Compare text A with text B and determine if text A is a paraphrase of text B. "
    "Respond with 'Yes' if text A is a paraphrase, and 'No' if it is not.\n\n"
    "$text\nAnswer:"
)

# Several text pairs per prompt, one Yes/No answer per line.
batch_clf = (
    "Compare text A with text B for the following $num text pairs and determine if text A is a paraphrase of text B line by line. "
    "Respond with 'Yes' if text A is a paraphrase, and 'No' if it is not. Provide your answers line by line.\n\n"
    "$texts\nAnswers:\n"
)

# Select the indices of the pairs matching one target label, free-text output.
index_selection_one_cat_a_time = (
    "Go over the $num text pairs below and list the index numbers of the text pairs where text A $be a paraphrase of text B according to the following instructions:\n"
    "If none of the text pairs satisfy this condition, write 'None.'\n"
    "If all the text pairs satisfy this condition, write 'All.'\n"
    "Otherwise, provide the index numbers of the text pairs where text A $be a paraphrase of text B, each on a separate line.\n\n"
    "Here are the text pairs:\n\n$texts\n"
    "'None,' 'All,' or the index numbers of the text pairs where text A $be a paraphrase of text B:\n"
)

# Same selection task, but the answer is requested as JSON under the key 'answer'.
index_selection_one_cat_a_time_json = (
    "Go over the $num text pairs below and list the index numbers of the text pairs where text A $be a paraphrase of text B according to the following instructions:\n"
    "If none of the text pairs satisfy this condition, write 'None.'\n"
    "If all the text pairs satisfy this condition, write 'All.'\n"
    "Otherwise, provide the index numbers of the text pairs where text A $be a paraphrase of text B.\n\n"
    "Output your responses in JSON format with the key 'answer'.\nA formatted example output is provided below.\n"
    "{'answer': [None/All or index numbers of the text pairs where text A $be a paraphrase of text B]}\n\n"
    "Here are the text pairs:\n\n$texts\n"
    "JSON output:\n"
)

# Select the indices for both labels in one JSON answer; the text pairs come
# before the output-format instructions.
index_selection_all_cat_at_once = (
    "Go over the $num text pairs below. First, list the index numbers of the text pairs that contain paraphrases. "
    "Then, list the index numbers of the text pairs that contain non-paraphrases.\n"
    "If none of the text pairs satisfy a condition, write 'None.'\n"
    "If all the text pairs satisfy a condition, write 'All.'\n"
    "Otherwise, provide the index numbers of the text pairs that satisfy each condition.\n\n"
    "Here are the text pairs:\n\n$texts\n"
    "Output your responses in JSON format with two keys: 'yes' for paraphrases and 'no' for non-paraphrases."
    "\nA formatted example output is provided below.\n"
    "{'yes': [None/All or index numbers of text pairs that contain paraphrases], "
    "'no': [None/All or index numbers of text pairs that contain non-paraphrases]}"
)

# Variant of the template above: the output-format instructions come first and
# the prompt ends with a "JSON output:" cue after the text pairs.
index_selection_all_cat_at_once_adjusted = (
    "Go over the $num text pairs below. First, list the index numbers of the text pairs that contain paraphrases. "
    "Then, list the index numbers of the text pairs that contain non-paraphrases.\n"
    "If none of the text pairs satisfy a condition, write 'None.'\n"
    "If all the text pairs satisfy a condition, write 'All.'\n"
    "Otherwise, provide the index numbers of the text pairs that satisfy each condition.\n\n"
    "Output your responses in JSON format with two keys: 'yes' for paraphrases and 'no' for non-paraphrases."
    "\nA formatted example output is provided below.\n"
    "{'yes': [None/All or index numbers of text pairs that contain paraphrases], "
    "'no': [None/All or index numbers of text pairs that contain non-paraphrases]}\n\n"
    "Here are the text pairs:\n\n$texts\n"
    "JSON output:\n"
)

# Task names and templates are parallel lists: position i of one belongs to
# position i of the other. Later cells index `tasks` by position (tasks[-1]),
# so the order must stay as is.
tasks = [
    "single_clf",
    "batch_clf",
    "index_selection_one_cat_a_time",
    "index_selection_all_cat_at_once",
    "index_selection_all_cat_at_once_adjusted",
    "index_selection_one_cat_a_time_json",
]
promptTemplates = [
    single_clf,
    batch_clf,
    index_selection_one_cat_a_time,
    index_selection_all_cat_at_once,
    index_selection_all_cat_at_once_adjusted,
    index_selection_one_cat_a_time_json,
]

# Register every template under its task name.
for task, template in zip(tasks, promptTemplates):
    database["promptTemplates"]["zero-shot"][task] = template

In [27]:
# Persist the database (including the templates registered above) for this task.
db_dir = "data/databases/text classification/"
os.makedirs(db_dir, exist_ok=True)
save_obj_as_pickle(database, f"{db_dir}{clf_task}.pkl")

Saved object to data/databases/text classification/MRPC.pkl


#### Test Prompts

- The main purpose is to check whether the LLMs produce output in the desired format for each prompt template.

In [13]:
def label_attr_converter(label):
    """Return the verb form used in place of ``$be`` for a target label.

    "Yes" maps to "is" and "No" maps to "isn't"; any other label raises KeyError.
    Defined with ``def`` rather than a name-bound lambda so it has a real name
    in tracebacks; later cells keep using it under the same name.
    """
    return {"Yes": "is", "No": "isn't"}[label]


num_instance = 2
taskSizes = [3, 5]

# Collect one frame of prompts per (task, task size) and concatenate at the end,
# so `dev` only ever refers to the final DataFrame.
dev_parts = []
for prompt_mode in ["zero-shot"]:
    for task in tasks:
        if task == "single_clf":
            # single_clf is built without a task size; keep only the first
            # num_instance prompts.
            dev_parts.append(make_prompts_for_clf(database, task, "dev", prompt_mode)[:num_instance])
            continue

        for taskSize in taskSizes:
            dev_parts.append(make_prompts_for_clf(database, task, "dev", prompt_mode, taskSize, attr="be",
                                                  label_attr_converter=label_attr_converter,
                                                  num_instance=num_instance))

dev = pd.concat(dev_parts).reset_index(drop=True)

In [14]:
# Print the first prompt (taskIndex == 1) of every task with task size <= 3,
# each followed by a dashed separator and a blank line.
first_small = (dev["taskSize"] <= 3) & (dev["taskIndex"] == 1)
for prompt in dev.loc[first_small, "prompt"]:
    print(prompt, "-" * 50, "", sep="\n")

Compare text A with text B and determine if text A is a paraphrase of text B. Respond with 'Yes' if text A is a paraphrase, and 'No' if it is not.

Text A: Stocks have rallied sharply for more than three months in anticipation of a rebound in the second half of the year.
Text B: Stocks have rallied sharply for more than three months in anticipation of an economic rebound in the year's second half.

Answer:
--------------------------------------------------

Compare text A with text B for the following 3 text pairs and determine if text A is a paraphrase of text B line by line. Respond with 'Yes' if text A is a paraphrase, and 'No' if it is not. Provide your answers line by line.

1. Text A: "I still want to be the candidate for guys with Confederate flags in their pickup trucks," Dean said Friday in a telephone interview from New Hampshire.
Text B: "I still want to be the candidate for guys with Confederate flags in their pickup trucks," he told The Des Moines Register.

2. Text A: Llo

In [15]:
# Request one completion per dev prompt and store the raw response next to it.
# get_completion is called with the prompt only, so it uses whatever default model
# scripts.llm defines — TODO confirm which one.
dev["preds"] = dev.prompt.apply(get_completion)
dev

Unnamed: 0,taskIndex,prompt,answer,targetLabel,task,#shot,CoT,taskSize,preds
0,1,"Compare text A with text B and determine if text A is a paraphrase of text B. Respond with 'Yes' if text A is a paraphrase, and 'No' if it is not....",Yes,,single_clf,0,False,1,Yes
1,2,"Compare text A with text B and determine if text A is a paraphrase of text B. Respond with 'Yes' if text A is a paraphrase, and 'No' if it is not....",No,,single_clf,0,False,1,No
2,1,Compare text A with text B for the following 3 text pairs and determine if text A is a paraphrase of text B line by line. Respond with 'Yes' if te...,"[Yes, Yes, Yes]",,batch_clf,0,False,3,1. No\n2. No\n3. Yes
3,2,Compare text A with text B for the following 3 text pairs and determine if text A is a paraphrase of text B line by line. Respond with 'Yes' if te...,"[Yes, Yes, Yes]",,batch_clf,0,False,3,1. No\n2. Yes\n3. No
4,1,Compare text A with text B for the following 5 text pairs and determine if text A is a paraphrase of text B line by line. Respond with 'Yes' if te...,"[Yes, Yes, Yes, Yes, Yes]",,batch_clf,0,False,5,1. No\n2. No\n3. Yes\n4. No\n5. Yes
5,2,Compare text A with text B for the following 5 text pairs and determine if text A is a paraphrase of text B line by line. Respond with 'Yes' if te...,"[Yes, Yes, Yes, Yes, Yes]",,batch_clf,0,False,5,1. No\n2. Yes\n3. No\n4. No\n5. Yes
6,1,Go over the 3 text pairs below and list the index numbers of the text pairs where text A isn't a paraphrase of text B according to the following i...,{None},No,index_selection_one_cat_a_time,0,False,3,None.
7,1,Go over the 3 text pairs below and list the index numbers of the text pairs where text A is a paraphrase of text B according to the following inst...,{All},Yes,index_selection_one_cat_a_time,0,False,3,1. 1\n2. None\n3. All
8,2,Go over the 3 text pairs below and list the index numbers of the text pairs where text A isn't a paraphrase of text B according to the following i...,{None},No,index_selection_one_cat_a_time,0,False,3,None.
9,2,Go over the 3 text pairs below and list the index numbers of the text pairs where text A is a paraphrase of text B according to the following inst...,{All},Yes,index_selection_one_cat_a_time,0,False,3,1\n2\nNone


### Adjusted

In [28]:
def label_attr_converter(label):
    """Return the verb form used in place of ``$be`` for a target label.

    "Yes" maps to "is" and "No" maps to "isn't"; any other label raises KeyError.
    Defined with ``def`` rather than a name-bound lambda so it has a real name
    in tracebacks; later cells keep using it under the same name.
    """
    return {"Yes": "is", "No": "isn't"}[label]


num_instance = 2
taskSizes = [3, 5]

# Only the last registered task (tasks[-1:]) is checked in this section.
# Frames are collected in a list and concatenated once, so `dev` only ever
# refers to the final DataFrame.
dev_parts = []
for prompt_mode in ["zero-shot"]:
    for task in tasks[-1:]:
        if task == "single_clf":
            # single_clf is built without a task size; keep only the first
            # num_instance prompts.
            dev_parts.append(make_prompts_for_clf(database, task, "dev", prompt_mode)[:num_instance])
            continue

        for taskSize in taskSizes:
            dev_parts.append(make_prompts_for_clf(database, task, "dev", prompt_mode, taskSize, attr="be",
                                                  label_attr_converter=label_attr_converter,
                                                  num_instance=num_instance))

dev = pd.concat(dev_parts).reset_index(drop=True)

In [30]:
# Show the first prompt (taskIndex == 1) for task sizes up to 3, each followed
# by a dashed separator and a blank line.
separator = "-" * 50
for prompt in dev.prompt[(dev["taskSize"] <= 3) & (dev["taskIndex"] == 1)]:
    print(f"{prompt}\n{separator}\n")

Go over the 3 text pairs below and list the index numbers of the text pairs where text A isn't a paraphrase of text B according to the following instructions:
If none of the text pairs satisfy this condition, write 'None.'
If all the text pairs satisfy this condition, write 'All.'
Otherwise, provide the index numbers of the text pairs where text A isn't a paraphrase of text B.

Output your responses in JSON format with the key 'answer'.
A formatted example output is provided below.
{'answer': [None/All or index numbers of the text pairs where text A isn't a paraphrase of text B]}

Here are the text pairs:

1. Text A: "I still want to be the candidate for guys with Confederate flags in their pickup trucks," Dean said Friday in a telephone interview from New Hampshire.
Text B: "I still want to be the candidate for guys with Confederate flags in their pickup trucks," he told The Des Moines Register.

2. Text A: Lloyds TSB confirmed on Tuesday it was "considering its options" over the sale

In [31]:
# Repeat the format check with an explicitly chosen model, passed to
# get_completion as its second argument.
model = "meta-llama/Llama-3-8b-chat-hf"
dev["preds"] = dev.prompt.apply(lambda p: get_completion(p, model))
dev

Unnamed: 0,taskIndex,prompt,answer,targetLabel,task,#shot,CoT,taskSize,preds
0,1,Go over the 3 text pairs below and list the index numbers of the text pairs where text A isn't a paraphrase of text B according to the following i...,{None},No,index_selection_one_cat_a_time_json,0,False,3,"Here is the output in JSON format:\n\n{\n""answer"": [2]\n}\n\nExplanation:\n\n* Text pair 1: Text A and Text B are identical, so they are paraphras..."
1,1,Go over the 3 text pairs below and list the index numbers of the text pairs where text A is a paraphrase of text B according to the following inst...,{All},Yes,index_selection_one_cat_a_time_json,0,False,3,"Here is the output in JSON format:\n\n{\n""answer"": [2]\n}\n\nExplanation:\n\n* Text pair 1: Text A and Text B are identical, but the source of the..."
2,2,Go over the 3 text pairs below and list the index numbers of the text pairs where text A isn't a paraphrase of text B according to the following i...,{None},No,index_selection_one_cat_a_time_json,0,False,3,"Here is the output in JSON format:\n\n{\n""answer"": [2]\n}\n\nExplanation:\n\n* Text pair 1: Text A and Text B are paraphrases of each other, as th..."
3,2,Go over the 3 text pairs below and list the index numbers of the text pairs where text A is a paraphrase of text B according to the following inst...,{All},Yes,index_selection_one_cat_a_time_json,0,False,3,"Here is the output in JSON format:\n\n{\n""answer"": [2]\n}\n\nExplanation:\n\n* Text pair 1: Text A is not a paraphrase of Text B, as the wording a..."
4,1,Go over the 5 text pairs below and list the index numbers of the text pairs where text A isn't a paraphrase of text B according to the following i...,{None},No,index_selection_one_cat_a_time_json,0,False,5,"Here is the output in JSON format:\n\n{\n""answer"": [2, 4]\n}\n\nExplanation:\n\n* Text pair 1: Text A and Text B are paraphrases of each other.\n*..."
5,1,Go over the 5 text pairs below and list the index numbers of the text pairs where text A is a paraphrase of text B according to the following inst...,{All},Yes,index_selection_one_cat_a_time_json,0,False,5,"Here is the output in JSON format:\n\n{\n""answer"": [2, 4]\n}\n\nExplanation:\n\n* Text pair 2: Text A is a paraphrase of Text B, as both sentences..."
6,2,Go over the 5 text pairs below and list the index numbers of the text pairs where text A isn't a paraphrase of text B according to the following i...,{None},No,index_selection_one_cat_a_time_json,0,False,5,"Here is the output in JSON format:\n\n{\n""answer"": [2, 4]\n}\n\nExplanation:\n\n* Text pair 1: Text A and Text B are paraphrases of each other.\n*..."
7,2,Go over the 5 text pairs below and list the index numbers of the text pairs where text A is a paraphrase of text B according to the following inst...,{All},Yes,index_selection_one_cat_a_time_json,0,False,5,"Here is the output in JSON format:\n\n{\n""answer"": [2, 4]\n}\n\nExplanation:\n\n* Text pair 2: Text A is a paraphrase of Text B, as they both desc..."


### Make prompts

In [32]:
# Reload the database from the file written by the save cell above.
# NOTE(review): read_obj_from_pickle presumably unpickles the file; only load
# pickle files produced by this project.
database = read_obj_from_pickle(f"data/databases/text classification/{clf_task}.pkl")

Read object from data/databases/text classification/MRPC.pkl


In [17]:
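# NOTE: one-off generation of the initial results file for all tasks. It is kept
# commented out because the next cell reads that file and appends to it.
# out = []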
# num_instance = 100
# taskSizes = [3, 5, 10, 20, 50]
# for propmtMode in ["zero-shot"]:
#     for task in tasks:

#         if task == "single_clf":
#             out.append(make_prompts_for_clf(database, task, "test", propmtMode))
#             continue

#         for taskSize in taskSizes:
#             out.append(make_prompts_for_clf(database, task, "test", propmtMode, taskSize, attr="be", 
#                                             label_attr_converter=label_attr_converter, num_instance=num_instance))

# out = pd.concat(out)
# out.reset_index(drop=True, inplace=True)

# os.makedirs("results/text classification/", exist_ok=True)
# out.to_json(f"results/text classification/{clf_task}.json", orient="records", lines=True)

In [37]:
# Extend the existing results file with the test prompts of the most recently
# added task (tasks[-1]). Rows already in the file are kept unchanged, including
# any completion columns they may carry.
fp = f"results/text classification/{clf_task}.json"
out = pd.read_json(fp, lines=True)

num_instance = 100
taskSizes = [3, 5, 10, 20, 50]
new_task = tasks[-1]

# Guard against re-running the cell: without it every execution would append
# another copy of the same prompts to the file.
if not (out["task"] == new_task).any():
    new_prompts = []
    for propmtMode in ["zero-shot"]:
        for taskSize in taskSizes:
            # label_attr_converter is defined in the prompt-testing cells above.
            new_prompts.append(make_prompts_for_clf(database, new_task, "test", propmtMode, taskSize, attr="be",
                                                    label_attr_converter=label_attr_converter,
                                                    num_instance=num_instance))

    # ignore_index renumbers the rows 0..n-1, like reset_index(drop=True) did.
    out = pd.concat([out, *new_prompts], ignore_index=True)

    os.makedirs(os.path.dirname(fp), exist_ok=True)
    out.to_json(fp, orient="records", lines=True)

In [38]:
# Number of prompts per task in the combined results frame.
out.task.value_counts()

single_clf                                  1725
index_selection_one_cat_a_time              1000
index_selection_one_cat_a_time_json         1000
batch_clf                                    500
index_selection_all_cat_at_once              500
index_selection_all_cat_at_once_adjusted     500
Name: task, dtype: int64

In [39]:
# Summary statistics of get_num_of_tokens over all prompts
# (presumably prompt length in tokens — TODO confirm against scripts.llm).
out.prompt.apply(get_num_of_tokens).describe()

count    5225.000000
mean      798.788900
std       950.927107
min        64.000000
25%       108.000000
50%       395.000000
75%      1207.000000
max      3265.000000
Name: prompt, dtype: float64

In [40]:
# First prompt (taskIndex == 1) of every task with task size <= 3.
# Filter first and copy only the selected rows; copying the whole frame before
# filtering duplicated every prompt and completion for no benefit.
first_small = (out.taskIndex == 1) & (out.taskSize <= 3)
sub = out[first_small].copy()
sub

Unnamed: 0,taskIndex,prompt,answer,targetLabel,task,#shot,CoT,taskSize,gpt-3.5-turbo-0125-completion,meta-llama/Llama-3-70b-chat-hf-completion,mistralai/Mixtral-8x7B-Instruct-v0.1-completion,meta-llama/Llama-3-8b-chat-hf-completion,gpt-4-turbo-2024-04-09-completion,lmsys/vicuna-13b-v1.5-completion,mistralai/Mistral-7B-Instruct-v0.2-completion
0,1,"Compare text A with text B and determine if text A is a paraphrase of text B. Respond with 'Yes' if text A is a paraphrase, and 'No' if it is not....",Yes,,single_clf,0,False,1,Yes,Yes,"Yes, text A is a paraphrase of text B. Both texts convey the same information, stating that Mike Butcher, the Chief Operating Officer, and Alex A...",Yes,Yes,Yes.,"Yes, text A is a paraphrase of text B. Both texts convey the same meaning, which is that Mike Butcher and Alex Arena report directly to So. The o..."
1725,1,Compare text A with text B for the following 3 text pairs and determine if text A is a paraphrase of text B line by line. Respond with 'Yes' if te...,"[No, Yes, Yes]",,batch_clf,0,False,3,1. No\n2. No\n3. Yes,Here are the comparisons:\n\n**Pair 1:**\n\n* Text A: China's Health Ministry said five more people had died of Sars and a further 159 were infect...,"1. Text A: China's Health Ministry said five more people had died of Sars and a further 159 were infected.\n Text B: On Monday, China said nine...",Here are the comparisons:\n\n1. Text A: China's Health Ministry said five more people had died of Sars and a further 159 were infected.\nText B: O...,1. No\n2. Yes\n3. Yes,1. No\n2. Yes\n3. Yes,"1. Yes, both texts report an increase in deaths and infections from SARS, with slight differences in wording.\n2. Yes, both texts report Intel's ..."
2225,1,Go over the 3 text pairs below and list the index numbers of the text pairs where text A isn't a paraphrase of text B according to the following i...,[1],No,index_selection_one_cat_a_time,0,False,3,None.,1,"The answers for each text pair:\n\n1. The texts are not exact paraphrases, but they convey the same information and use a similar structure. Text...","After analyzing the text pairs, I found that:\n\n* Text A and Text B in pair 1 are paraphrases of each other, as they both report the same informa...",1,None.,"None.\n\nIn all three text pairs, text A can be considered a paraphrase of text B, as both texts convey the same essential information using diff..."
2226,1,Go over the 3 text pairs below and list the index numbers of the text pairs where text A is a paraphrase of text B according to the following inst...,"[2, 3]",Yes,index_selection_one_cat_a_time,0,False,3,1. None\n2. None\n3. 3,2\n3,1.,"After analyzing the text pairs, I found that:\n\n* Text A in pair 1 is not a paraphrase of Text B, as the numbers of deaths and infections are dif...",3,None.,"None.\n\nWhile the text pairs share similarities, they are not exact paraphrases of each other. The differences in word choice, sentence structur..."
3225,1,"Go over the 3 text pairs below. First, list the index numbers of the text pairs that contain paraphrases. Then, list the index numbers of the text...","{'no': [1], 'yes': [2, 3]}",,index_selection_all_cat_at_once,0,False,3,"{\n ""yes"": [1],\n ""no"": [2, 3]\n}","Here is the output:\n\n{'yes': [1, 2, 3], 'no': []}\n\nExplanation:\n\n* Text pair 1 contains paraphrases, as both texts report on the same topic ...","{\n 'yes': [1, 3],\n 'no': [2]\n}","Here is the output in JSON format:\n\n{\n""yes"": [1, 3],\n""no"": [2]\n}\n\nExplanation:\n\n* Text pair 1 contains paraphrases, as both texts report ...","{\n ""yes"": [3],\n ""no"": [1, 2]\n}","{\n""yes"": [1, 3],\n""no"": [2]\n}","{'yes': [1], 'no': [2, 3]}\n\nExplanation:\nText pair 1 contains paraphrases as both texts report the number of deaths and new infections from SA..."
3725,1,"Go over the 3 text pairs below. First, list the index numbers of the text pairs that contain paraphrases. Then, list the index numbers of the text...","{'no': [1], 'yes': [2, 3]}",,index_selection_all_cat_at_once_adjusted,0,False,3,,,,,,,
4225,1,Go over the 3 text pairs below and list the index numbers of the text pairs where text A isn't a paraphrase of text B according to the following i...,{1},No,index_selection_one_cat_a_time_json,0,False,3,,,,,,,
4226,1,Go over the 3 text pairs below and list the index numbers of the text pairs where text A is a paraphrase of text B according to the following inst...,"{2, 3}",Yes,index_selection_one_cat_a_time_json,0,False,3,,,,,,,


In [41]:
# out["lmsys/vicuna-13b-v1.5-completion"].str.contains("TOO_").sum()

0