In [1]:
import os
import pandas as pd

# Later cells use paths relative to the working directory ("data/...", "results/..."),
# and the `scripts` imports below presumably rely on it as well, so move two levels up
# from the notebook's folder.
# The target is resolved only once per kernel session: a bare os.chdir("../../") would
# climb two MORE levels every time this cell is re-run.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("../../")
os.chdir(PROJECT_ROOT)

from scripts.llm import get_num_of_tokens, get_completion
from scripts.utils import save_obj_as_pickle, read_obj_from_pickle
from scripts.data import make_database, make_prompts_for_clf

# Show up to 150 characters per cell so sentences and prompts stay readable in tables.
pd.set_option("display.max_colwidth", 150)

In [2]:
def loadData(fp):
    """Read a headerless, tab-separated file and return its labels and sentences.

    The file is parsed as four columns (source, label, note, text). Only ``label``
    and ``text`` are kept, and the numeric labels 0 / 1 are replaced by the strings
    "Unacceptable" / "Acceptable".

    Parameters
    ----------
    fp : path of the TSV file to read.

    Returns
    -------
    pandas.DataFrame with the columns ``label`` and ``text``.
    """
    column_names = ["source", "label", "note", "text"]
    label_names = {0: "Unacceptable", 1: "Acceptable"}

    raw = pd.read_table(fp, header=None, names=column_names)
    kept = raw[["label", "text"]]
    return kept.assign(label=kept["label"].map(label_names))

clf_task = "CoLA"

# Evaluation split: the public CoLA test set is not labelled, so the official
# dev file (dev.tsv) is used as our test set.
test_fp = f"data/raw/text classification/{clf_task}/dev.tsv"

# Development split: the official training file (train.tsv) is used as our dev set.
# It can be used to tune prompts and other hyperparameters, and may also serve as a
# source of few-shot and CoT exemplars.
dev_fp = f"data/raw/text classification/{clf_task}/train.tsv"

test_df, dev_df = loadData(test_fp), loadData(dev_fp)

test_df.head()

Unnamed: 0,label,text
0,Acceptable,The sailors rode the breeze clear of the rocks.
1,Acceptable,The weights made the rope stretch over the pulley.
2,Acceptable,The mechanical doll wriggled itself loose.
3,Acceptable,"If you had eaten more, you would want less."
4,Unacceptable,"As you eat the most, you want the least."


In [4]:
# Class balance of the evaluation split.
test_df["label"].value_counts()

Acceptable      721
Unacceptable    322
Name: label, dtype: int64

In [5]:
# Summary statistics of the per-sentence token counts in the evaluation split.
test_df["text"].apply(get_num_of_tokens).describe()

count    1043.000000
mean        9.629914
std         4.575570
min         3.000000
25%         7.000000
50%         8.000000
75%        11.500000
max        33.000000
Name: text, dtype: float64

In [6]:
# Class balance of the development split.
dev_df["label"].value_counts()

Acceptable      6023
Unacceptable    2528
Name: label, dtype: int64

In [7]:
# Summary statistics of the per-sentence token counts in the development split.
dev_df["text"].apply(get_num_of_tokens).describe()

count    8551.000000
mean        9.362882
std         3.982960
min         2.000000
25%         7.000000
50%         9.000000
75%        11.000000
max        45.000000
Name: text, dtype: float64

### Database

Sources for the prompt data

In [56]:
# Build the prompt database from the evaluation and development splits.
# NOTE(review): the exact meaning of num_instance / max_instance_size is defined in
# scripts.data.make_database — confirm there before changing these values.
database = make_database(
    test_df,
    dev_df,
    num_instance=500,
    max_instance_size=500,
)
database.keys()

dict_keys(['num_instance', 'max_instance_size', 'labels', 'testData', 'testInstances', 'devData', 'devInstances'])

#### Zero-shot prompts

In [4]:
# Zero-shot prompt templates for the five task formats.
# The $-placeholders ($text, $texts, $num, $acceptability) are kept verbatim here;
# presumably they are filled in when the prompts are built — confirm in scripts.data.

# One sentence per prompt.
single_clf = (
    "Indicate the grammatical acceptability for the following line of text. "
    "The acceptability shall be either 'Acceptable' or 'Unacceptable.'\n\n"
    "Text: $text\nGrammatical acceptability:"
)

# Several sentences per prompt, one label per line expected back.
batch_clf = (
    "Indicate the grammatical acceptabilities for each of the $num following lines of text. "
    "The acceptability shall be either 'Acceptable' or 'Unacceptable.'\n\nTexts, one per line:\n\n"
    "$texts\n\nGrammatical acceptabilities for each of the $num lines of text, one per line:\n"
)

# Index selection for a single target category, plain-text answer.
index_selection_one_cat_a_time = (
    "Go over the $num lines of text below and list the index numbers of the lines that are grammatically $acceptability according to the following instructions:\n"
    "If none of the texts are grammatically $acceptability, write 'None.'\n"
    "If all the texts are grammatically $acceptability, write 'All.'\n"
    "Otherwise, provide the index numbers for each grammatically $acceptability text, each on a separate line.\n\n"
    "Texts, one per line:\n\n$texts\n\n"
    "'None,' 'All,' or the index numbers for the grammatically $acceptability texts, one per line:\n"
)

# Index selection for both categories in one JSON answer.
index_selection_all_cat_at_once = (
    "Go over the $num lines of text below. First, list the index numbers of the lines that are grammatically acceptable. "
    "Then, list the index numbers of the lines that are grammatically unacceptable.\n"
    "If none of the sentences show a particular acceptability, write 'None.'\n"
    "If all the sentences show a particular acceptability, write 'All.'\n"
    "Otherwise, provide the index numbers of the texts that fit a particular category.\n"
    "Output your responses in JSON format with two keys 'acceptable' and 'unacceptable.'\nA formatted example output is provided below. \n"
    "{'acceptable': [None/All or index numbers of acceptable texts], 'unacceptable': [None/All or index numbers of unacceptable texts]}"
    "\n\nTexts, one per line:\n\n$texts\n\n"
    "JSON output:\n"
)

# Index selection for a single target category, JSON answer.
index_selection_one_cat_a_time_json = (
    "Go over the $num lines of text below and list the index numbers of the lines that are grammatically $acceptability according to the following instructions:\n"
    "If none of the texts are grammatically $acceptability, write 'None.'\n"
    "If all the texts are grammatically $acceptability, write 'All.'\n"
    "Otherwise, provide the index numbers for each grammatically $acceptability text.\n\n"
    "Output your responses in JSON format with the key '$acceptability'.\nA formatted example output is provided below.\n"
    "{'$acceptability': [None/All or index numbers of $acceptability sentences]}\n\n"
    "Texts, one per line:\n\n$texts\n\n"
    "JSON output:\n"
)

# Task name -> template, in the order the rest of the notebook expects.
zero_shot_templates = {
    "single_clf": single_clf,
    "batch_clf": batch_clf,
    "index_selection_one_cat_a_time": index_selection_one_cat_a_time,
    "index_selection_all_cat_at_once": index_selection_all_cat_at_once,
    "index_selection_one_cat_a_time_json": index_selection_one_cat_a_time_json,
}

# Later cells iterate over `tasks` and pick `tasks[-1]`, so the order above matters.
tasks = list(zero_shot_templates)
promptTemplates = list(zero_shot_templates.values())

# Replaces any previously stored templates with the zero-shot set.
database["promptTemplates"] = {"zero-shot": dict(zero_shot_templates)}

In [5]:
# Persist the database dict (now including the prompt templates) to disk.
database_dir = "data/databases/text classification/"
os.makedirs(database_dir, exist_ok=True)

database_fp = f"data/databases/text classification/{clf_task}.pkl"
save_obj_as_pickle(database, database_fp)

Saved object to data/databases/text classification/CoLA.pkl


#### Test Prompts

- The main purpose is to check if LLMs can output the desired formats given the prompts 

In [12]:
# Build a handful of dev prompts per task (and per batch size) for a format smoke test.
num_instance = 2
taskSizes = [5, 10]

dev_frames = []
for promptMode in ["zero-shot"]:
    for task in tasks:
        if task == "single_clf":
            # single_clf is requested without a task size; keep only the first rows.
            single_prompts = make_prompts_for_clf(database, task, "dev", promptMode)
            dev_frames.append(single_prompts[:num_instance])
        else:
            dev_frames.extend(
                make_prompts_for_clf(database, task, "dev", promptMode, taskSize, attr="acceptability",
                                     label_attr_converter=None, num_instance=num_instance)
                for taskSize in taskSizes
            )

dev = pd.concat(dev_frames).reset_index(drop=True)

In [17]:
# Print the first prompt of every small (taskSize <= 5) configuration, each followed
# by a dashed separator and a blank line.
first_small_tasks = dev[(dev["taskSize"] <= 5) & (dev["taskIndex"] == 1)]
for p in first_small_tasks.prompt:
    print(p, "-"*50, "", sep="\n")

Indicate the grammatical acceptability for the following line of text. The acceptability shall be either 'Acceptable' or 'Unacceptable.'

Text: Our friends won't buy this analysis, let alone the next one we propose.
Grammatical acceptability:
--------------------------------------------------

Indicate the grammatical acceptabilities for each of the 5 following lines of text. The acceptability shall be either 'Acceptable' or 'Unacceptable.'

Texts, one per line:

1. The employees staffed the store.
2. Jack disappeared in a mysterious manner and Marion disappeared in one too.
3. Dorothy is needing new shoes.
4. Ellen told a story at Helen.
5. The money which I have hopes that the company will squander amounts to $400,000.

Grammatical acceptabilities for each of the 5 lines of text, one per line:

--------------------------------------------------

Go over the 5 lines of text below and list the index numbers of the lines that are grammatically acceptable according to the following instr

In [14]:
# Request one completion per prompt (get_completion is called with its defaults)
# and store the raw response next to the expected answer.
dev["preds"] = dev["prompt"].apply(get_completion)
dev

Unnamed: 0,taskIndex,prompt,answer,targetLabel,task,#shot,CoT,taskSize,preds
0,1,Indicate the grammatical acceptability for the following line of text. The acceptability shall be either 'Acceptable' or 'Unacceptable.'\n\nText: ...,Acceptable,,single_clf,0,False,1,Acceptable
1,2,Indicate the grammatical acceptability for the following line of text. The acceptability shall be either 'Acceptable' or 'Unacceptable.'\n\nText: ...,Acceptable,,single_clf,0,False,1,Acceptable
2,1,Indicate the grammatical acceptabilities for each of the 5 following lines of text. The acceptability shall be either 'Acceptable' or 'Unacceptabl...,"[Acceptable, Unacceptable, Unacceptable, Unacceptable, Acceptable]",,batch_clf,0,False,5,1. Acceptable\n2. Acceptable\n3. Unacceptable\n4. Unacceptable\n5. Acceptable
3,2,Indicate the grammatical acceptabilities for each of the 5 following lines of text. The acceptability shall be either 'Acceptable' or 'Unacceptabl...,"[Unacceptable, Acceptable, Acceptable, Unacceptable, Acceptable]",,batch_clf,0,False,5,1. Acceptable\n2. Acceptable\n3. Unacceptable\n4. Unacceptable\n5. Acceptable
4,1,Indicate the grammatical acceptabilities for each of the 10 following lines of text. The acceptability shall be either 'Acceptable' or 'Unacceptab...,"[Acceptable, Unacceptable, Unacceptable, Unacceptable, Acceptable, Acceptable, Acceptable, Acceptable, Unacceptable, Acceptable]",,batch_clf,0,False,10,1. Acceptable\n2. Unacceptable\n3. Unacceptable\n4. Unacceptable\n5. Acceptable\n6. Unacceptable\n7. Acceptable\n8. Acceptable\n9. Unacceptable\n1...
5,2,Indicate the grammatical acceptabilities for each of the 10 following lines of text. The acceptability shall be either 'Acceptable' or 'Unacceptab...,"[Unacceptable, Acceptable, Acceptable, Unacceptable, Acceptable, Unacceptable, Acceptable, Acceptable, Acceptable, Acceptable]",,batch_clf,0,False,10,1. Acceptable\n2. Acceptable\n3. Unacceptable\n4. Unacceptable\n5. Acceptable\n6. Unacceptable\n7. Acceptable\n8. Acceptable\n9. Acceptable\n10. A...
6,1,Go over the 5 lines of text below and list the index numbers of the lines that are grammatically acceptable according to the following instruction...,"{1, 5}",Acceptable,index_selection_one_cat_a_time,0,False,5,1\n5
7,1,Go over the 5 lines of text below and list the index numbers of the lines that are grammatically unacceptable according to the following instructi...,"{2, 3, 4}",Unacceptable,index_selection_one_cat_a_time,0,False,5,3\n4
8,2,Go over the 5 lines of text below and list the index numbers of the lines that are grammatically acceptable according to the following instruction...,"{2, 3, 5}",Acceptable,index_selection_one_cat_a_time,0,False,5,1\n2\n4\n5
9,2,Go over the 5 lines of text below and list the index numbers of the lines that are grammatically unacceptable according to the following instructi...,"{1, 4}",Unacceptable,index_selection_one_cat_a_time,0,False,5,2\n3\n4


### Adjusted: JSON-output variant of the one-category index-selection prompt

In [6]:
# Same smoke test as before, restricted to the last entry of `tasks`.
num_instance = 2
taskSizes = [5, 10]

dev_parts = []
for promptMode in ["zero-shot"]:
    for task in tasks[-1:]:
        if task != "single_clf":
            for taskSize in taskSizes:
                batch_prompts = make_prompts_for_clf(database, task, "dev", promptMode, taskSize, attr="acceptability",
                                                     label_attr_converter=None, num_instance=num_instance)
                dev_parts.append(batch_prompts)
        else:
            # single_clf is requested without a task size; keep only the first rows.
            dev_parts.append(make_prompts_for_clf(database, task, "dev", promptMode)[:num_instance])

dev = pd.concat(dev_parts).reset_index(drop=True)

In [7]:
# Print the first prompt of every small (taskSize <= 5) configuration, each followed
# by a dashed separator and a blank line.
is_first_small_task = (dev["taskSize"] <= 5) & (dev["taskIndex"] == 1)
for p in dev.loc[is_first_small_task, "prompt"]:
    print(p)
    print("-"*50, end="\n\n")

Go over the 5 lines of text below and list the index numbers of the lines that are grammatically acceptable according to the following instructions:
If none of the texts are grammatically acceptable, write 'None.'
If all the texts are grammatically acceptable, write 'All.'
Otherwise, provide the index numbers for each grammatically acceptable text.

Output your responses in JSON format with the key 'acceptable'.
A formatted example output is provided below.
{'acceptable': [None/All or index numbers of acceptable sentences]}

Texts, one per line:

1. The employees staffed the store.
2. Jack disappeared in a mysterious manner and Marion disappeared in one too.
3. Dorothy is needing new shoes.
4. Ellen told a story at Helen.
5. The money which I have hopes that the company will squander amounts to $400,000.

JSON output:

--------------------------------------------------

Go over the 5 lines of text below and list the index numbers of the lines that are grammatically unacceptable accordi

In [8]:
# Request one completion per prompt from an explicitly chosen model; the model name
# is forwarded as the second positional argument of get_completion.
model = "meta-llama/Llama-3-8b-chat-hf"
dev["preds"] = dev["prompt"].apply(get_completion, args=(model,))
dev

Unnamed: 0,taskIndex,prompt,answer,targetLabel,task,#shot,CoT,taskSize,preds
0,1,Go over the 5 lines of text below and list the index numbers of the lines that are grammatically acceptable according to the following instruction...,"{1, 5}",Acceptable,index_selection_one_cat_a_time_json,0,False,5,"After analyzing the texts, I found that only one line is grammatically acceptable. Here is the output in JSON format:\n\n{'acceptable': [1]}"
1,1,Go over the 5 lines of text below and list the index numbers of the lines that are grammatically unacceptable according to the following instructi...,"{2, 3, 4}",Unacceptable,index_selection_one_cat_a_time_json,0,False,5,"After reviewing the texts, I found that the following lines are grammatically unacceptable:\n\n{'unacceptable': [2, 3, 4, 5]}\n\nHere's a brief ex..."
2,2,Go over the 5 lines of text below and list the index numbers of the lines that are grammatically acceptable according to the following instruction...,"{2, 3, 5}",Acceptable,index_selection_one_cat_a_time_json,0,False,5,"After analyzing the texts, I found that not all of them are grammatically acceptable. Here is the output in JSON format:\n\n{\n""acceptable"": [2, 5..."
3,2,Go over the 5 lines of text below and list the index numbers of the lines that are grammatically unacceptable according to the following instructi...,"{1, 4}",Unacceptable,index_selection_one_cat_a_time_json,0,False,5,"After reviewing the texts, I found that the following lines are grammatically unacceptable:\n\n{'unacceptable': [3]}\n\nThe third line ""The socks ..."
4,1,Go over the 10 lines of text below and list the index numbers of the lines that are grammatically acceptable according to the following instructio...,"{1, 5, 6, 7, 8, 10}",Acceptable,index_selection_one_cat_a_time_json,0,False,10,"After reviewing the texts, I found that the following lines are grammatically acceptable:\n\n{'acceptable': [2, 7, 10]}\n\nThe texts at index numb..."
5,1,Go over the 10 lines of text below and list the index numbers of the lines that are grammatically unacceptable according to the following instruct...,"{9, 2, 3, 4}",Unacceptable,index_selection_one_cat_a_time_json,0,False,10,"After reviewing the texts, I found the following grammatically unacceptable lines:\n\n{'unacceptable': [4, 3, 6, 9]}\n\nHere's a brief explanation..."
6,2,Go over the 10 lines of text below and list the index numbers of the lines that are grammatically acceptable according to the following instructio...,"{2, 3, 5, 7, 8, 9, 10}",Acceptable,index_selection_one_cat_a_time_json,0,False,10,"After reviewing the texts, I found that not all of them are grammatically acceptable. Here is the output in JSON format:\n\n{\n""acceptable"": [2, 7..."
7,2,Go over the 10 lines of text below and list the index numbers of the lines that are grammatically unacceptable according to the following instruct...,"{1, 4, 6}",Unacceptable,index_selection_one_cat_a_time_json,0,False,10,"After reviewing the texts, I found the following lines to be grammatically unacceptable:\n\n{'unacceptable': [4, 6, 10]}\n\nHere's a brief explana..."


### Make prompts

In [9]:
# Reload the saved database so this section can run without rebuilding it.
# NOTE(review): only load pickle files this project wrote itself.
clf_task = "CoLA"
database_fp = f"data/databases/text classification/{clf_task}.pkl"
database = read_obj_from_pickle(database_fp)

Read object from data/databases/text classification/CoLA.pkl


In [19]:
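# Disabled: initial generation of the test prompts for the four tasks listed below.
# Kept for reference; a later cell reads the results file this code wrote and extends it.
# out = []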
# num_instance = 100
# tasks = ["single_clf", "batch_clf", "index_selection_one_cat_a_time", "index_selection_all_cat_at_once"]

# taskSizes = [5, 10, 20, 50, 100]
# for propmtMode in ["zero-shot"]:
#     for task in tasks:

#         if task == "single_clf":
#             out.append(make_prompts_for_clf(database, task, "test", propmtMode))
#             continue

#         for taskSize in taskSizes:
#             out.append(make_prompts_for_clf(database, task, "test", propmtMode, taskSize, attr="acceptability", 
#                                             label_attr_converter=None, num_instance=num_instance))

# out = pd.concat(out)
# out.reset_index(drop=True, inplace=True)

In [20]:
# os.makedirs("results/text classification/", exist_ok=True)
# out.to_json(f"results/text classification/{clf_task}.json", orient="records", lines=True)                                             

In [13]:
# Extend the existing results file with test prompts for the JSON variant of the
# one-category index-selection task, keeping every row that is already in the file.
fp = f"results/text classification/{clf_task}.json"
existing = pd.read_json(fp, lines=True)

# Named explicitly instead of tasks[-1] so the cell does not depend on list order.
new_task = "index_selection_one_cat_a_time_json"
num_instance = 100
taskSizes = [5, 10, 20, 50, 100]

# Without these columns nothing can be matched, so every size is generated.
can_match = {"task", "taskSize"}.issubset(existing.columns)

frames = [existing]
for propmtMode in ["zero-shot"]:
    for taskSize in taskSizes:
        # This cell writes back to the file it reads, so a re-run would append the
        # same prompts again. Skip sizes that are already present.
        # NOTE: the check ignores the prompt mode; extend it if more modes are added.
        if can_match and ((existing["task"] == new_task) & (existing["taskSize"] == taskSize)).any():
            continue
        frames.append(make_prompts_for_clf(database, new_task, "test", propmtMode, taskSize, attr="acceptability",
                                           label_attr_converter=None, num_instance=num_instance))

out = pd.concat(frames, ignore_index=True)

os.makedirs("results/text classification/", exist_ok=True)
out.to_json(fp, orient="records", lines=True)

In [14]:
# Number of prompts per task in the combined results frame.
out["task"].value_counts()

single_clf                             1043
index_selection_one_cat_a_time         1000
index_selection_one_cat_a_time_json    1000
batch_clf                               500
index_selection_all_cat_at_once         500
Name: task, dtype: int64

In [15]:
# Summary statistics of the prompt lengths, measured in tokens.
out["prompt"].apply(get_num_of_tokens).describe()

count    4043.000000
mean      410.453871
std       411.478913
min        40.000000
25%        57.500000
50%       233.000000
75%       668.000000
max      1397.000000
Name: prompt, dtype: float64

In [16]:
# Inspect the first task instance of every small (taskSize <= 5) configuration.
# Filter first and copy afterwards: only the selected rows are duplicated, instead of
# copying the whole results frame and then discarding most of it.
is_first_small_task = (out["taskIndex"] == 1) & (out["taskSize"] <= 5)
sub = out[is_first_small_task].copy()
sub

Unnamed: 0,taskIndex,prompt,answer,targetLabel,task,#shot,CoT,taskSize,gpt-3.5-turbo-0125-completion,meta-llama/Llama-3-70b-chat-hf-completion,mistralai/Mixtral-8x7B-Instruct-v0.1-completion,meta-llama/Llama-3-8b-chat-hf-completion,gpt-4-turbo-2024-04-09-completion,lmsys/vicuna-13b-v1.5-completion,mistralai/Mistral-7B-Instruct-v0.2-completion
0,1,Indicate the grammatical acceptability for the following line of text. The acceptability shall be either 'Acceptable' or 'Unacceptable.'\n\nText: ...,Acceptable,,single_clf,0,False,1,Acceptable,Grammatical acceptability: Acceptable,Acceptable.,Acceptable,Acceptable,Acceptable.,"Acceptable. In this sentence, ""rode"" is the past tense of the transitive verb ""ride,"" which means to be carried by or move along on something. He..."
1043,1,Indicate the grammatical acceptabilities for each of the 5 following lines of text. The acceptability shall be either 'Acceptable' or 'Unacceptabl...,"[Acceptable, Acceptable, Unacceptable, Unacceptable, Unacceptable]",,batch_clf,0,False,5,1. Acceptable\n2. Acceptable\n3. Unacceptable\n4. Unacceptable\n5. Unacceptable,Here are the grammatical acceptabilities for each of the 5 lines of text:\n\n1. Acceptable\n2. Acceptable\n3. Unacceptable\n4. Unacceptable\n5. Un...,"1. Acceptable\n2. Acceptable\n3. Unacceptable - correct version: ""John believes that Bill is here.""\n4. Acceptable\n5. Unacceptable - correct ver...",Here are the grammatical acceptabilities for each of the 5 lines of text:\n\n1. It was believed that Bill was tardy. - Acceptable\n2. He likes coo...,1. Acceptable\n2. Acceptable\n3. Unacceptable\n4. Unacceptable\n5. Unacceptable,1. Acceptable\n2. Acceptable\n3. Unacceptable\n4. Unacceptable\n5. Acceptable,"1. Acceptable.\n2. Acceptable, but it would be more common to say ""He likes cookies, but he hates crumb cake.""\n3. Unacceptable. It should be ""Jo..."
1543,1,Go over the 5 lines of text below and list the index numbers of the lines that are grammatically acceptable according to the following instruction...,"[1, 2]",Acceptable,index_selection_one_cat_a_time,0,False,5,None.,1\n2,1\n2\n5,1\n4,1\n2,All.\n\n1. It was believed that Bill was tardy.\n2. He likes cookies and he hates crumb cake.\n3. John believes it that Bill is here.\n4. Chris h...,1. It was believed that Bill was tardy.\n2. He likes cookies and he hates crumb cake.\n3. John believes that Bill is here.\n5. Jack is the person...
1544,1,Go over the 5 lines of text below and list the index numbers of the lines that are grammatically unacceptable according to the following instructi...,"[3, 4, 5]",Unacceptable,index_selection_one_cat_a_time,0,False,5,3\n4\n5,3\n4\n5,The grammatically unacceptable lines are:\n3. John believes it that Bill is here.\n5. Jack is the person with whom Jenny fell in love with.\n\nSo...,1\n5,3\n4\n5,None.,"None.\n\nExplanation:\n\n1. ""It was believed that Bill was tardy"" is grammatically acceptable.\n2. ""He likes cookies and he hates crumb cake"" is ..."
2543,1,"Go over the 5 lines of text below. First, list the index numbers of the lines that are grammatically acceptable. Then, list the index numbers of t...","{'acceptable': [1, 2], 'unacceptable': [3, 4, 5]}",,index_selection_all_cat_at_once,0,False,5,"{\n ""acceptable"": [1, 2],\n ""unacceptable"": [3, 4, 5]\n}","Here is the output:\n\n{'acceptable': [1, 2], 'unacceptable': [3, 4, 5]}\n\nExplanation:\n\n* Line 1: ""It was believed that Bill was tardy."" is gr...","{\n ""acceptable"": [1, 2, 5],\n ""unacceptable"": [3, 4]\n}","After analyzing the text, I found the following results:\n\n{\n""acceptable"": [1, 2], \n""unacceptable"": [3, 4, 5]\n}\n\nHere's a brief explanation ...","```json\n{\n ""acceptable"": [1, 2, 5],\n ""unacceptable"": [3, 4]\n}\n```","{\n""acceptable"": [1, 2, 4],\n""unacceptable"": [3, 5]\n}","{""acceptable"": [0, 1, 2, 5], ""unacceptable"": [4]}\n\nExplanation:\nLines 0, 1, and 2 are grammatically acceptable.\nLine 4 is ungrammatical becau..."
3043,1,Go over the 5 lines of text below and list the index numbers of the lines that are grammatically acceptable according to the following instruction...,"{1, 2}",Acceptable,index_selection_one_cat_a_time_json,0,False,5,,,,,,,
3044,1,Go over the 5 lines of text below and list the index numbers of the lines that are grammatically unacceptable according to the following instructi...,"{3, 4, 5}",Unacceptable,index_selection_one_cat_a_time_json,0,False,5,,,,,,,


In [17]:
# Count the vicuna completions that contain the marker "TOO_LONG_SKIP".
# regex=False matches the marker literally; na=False counts rows that have no
# completion yet as "not skipped" instead of producing missing values.
vicuna_completions = out["lmsys/vicuna-13b-v1.5-completion"]
vicuna_completions.str.contains("TOO_LONG_SKIP", regex=False, na=False).sum()

0