In [39]:
from datasets import load_dataset
import dspy
import openai
import os
import re
import pandas as pd
import json
import random
from dotenv import load_dotenv
import glob

In [40]:
load_dotenv()

True

In [3]:
lm = dspy.LM('openrouter/anthropic/claude-3.5-sonnet', temperature=0, max_tokens=1024)
dspy.configure(lm=lm)

In [4]:
ds = pd.read_json('../preprocessing/train_dev_test_data/dialogue/test.json')
ds = ds.to_dict('records')


In [5]:
samples_with_contradiction = []
samples_no_contradiction = []
for i, x in enumerate(ds):
    if x["is_contradiction"]:
        samples_with_contradiction.append((i, x))
    else:
        samples_no_contradiction.append((i, x))
print(len(samples_with_contradiction), len(samples_no_contradiction))

2108 2108


In [6]:
samples = samples_with_contradiction + samples_no_contradiction
random.shuffle(samples)

In [7]:
for sample in samples:
    dialogue = sample[1]['dialogue']
    for i in range(len(dialogue)):
        dialogue[i] = 'agent ' + str(i%2) + ': ' + dialogue[i]


In [8]:
label_map = {'is_contradiction': 1, 'no_contradiction': 0}

In [9]:
def remove_space(text):
    """Clean up spacing and formatting in dialogue text."""
    lines = text.split('\n')
    cleaned_lines = []
    
    for line in lines:
        # Remove multiple spaces
        cleaned = ' '.join(line.split())
        
        # Fix spacing around punctuation
        cleaned = re.sub(r'\s+([.,!?])', r'\1', cleaned)
        cleaned = re.sub(r'([.,!?])\s+', r'\1 ', cleaned)
        
        # Fix contractions
        cleaned = re.sub(r'\s*\'\s*s\b', "'s", cleaned)
        cleaned = re.sub(r'\s*n\s*\'\s*t\b', "n't", cleaned)
        cleaned = re.sub(r'\s*\'\s*ve\b', "'ve", cleaned)
        cleaned = re.sub(r'\s*\'\s*re\b', "'re", cleaned)
        cleaned = re.sub(r'\s*\'\s*ll\b', "'ll", cleaned)
        cleaned = re.sub(r'\s*\'\s*d\b', "'d", cleaned)
        cleaned = re.sub(r'\s*\'\s*m\b', "'m", cleaned)
        
        # Fix spaces around parentheses
        cleaned = re.sub(r'\(\s+', '(', cleaned)
        cleaned = re.sub(r'\s+\)', ')', cleaned)
        
        # Remove leading/trailing whitespace
        cleaned = cleaned.strip()
        
        cleaned_lines.append(cleaned)
        
    return '\n'.join(cleaned_lines)


In [10]:
examples = [
    dspy.Example({ 
                  "dialogue" : remove_space('\n'.join(r["dialogue"])), 
                  "label": label_map[r['label']]
                }
                  ).with_inputs("dialogue") 
    for i,r in samples
    ]

In [11]:
example = examples[59]
for k, v in example.items():
    print(f"\n{k.upper()}:\n")
    print(v)



DIALOGUE:

agent 0: Bowling is considered a "throwing sport".
agent 1: I like bowling but I don't go bowling often. What about you?
agent 0: I do. I get turkeys a lot. That's three consecutive strikes.
agent 1: I never heard of this term. What else do you know about bowling?
agent 0: Hambone is when you get four consecutive strikes. I've never gotten that.
agent 1: Oh that sounds pretty professional, I don't think I'd be that good at it.
agent 0: Yeah, there are alot of unknown games that people don't know.
agent 1: I am part of a bowling league but I've never heard of those terms.

LABEL:

1


In [12]:
def extract_prediction(text):
    matches = re.findall(r'\b[0-2]\b', text)
    # print(matches)
    parsed_answer = matches[-1] if matches else ""
    return parsed_answer

In [13]:
def eval_metric(true, prediction, trace=None):
    pred = prediction.label
    # Find all occurrences of 0 or 1 that are bounded by word boundaries (\b)
    # \b ensures we only match standalone 0s and 1s, not numbers like 10 or 01
    # [0-1] matches either 0 or 1
    matches = re.findall(r'\b[0-2]\b', pred)
    # print(matches)
    parsed_answer = matches[-1] if matches else ""
    
    # print(parsed_answer)
    return parsed_answer == str(true.label)

# Evaluate the original test set

In [14]:
from dspy.evaluate import Evaluate

In [34]:
class Dialogue(dspy.Signature):
    """Does the last utterance contradict the dialogue context? Answer with 1 if contradict, 0 if not contradict"""
    dialogue = dspy.InputField()
    label = dspy.OutputField(prefix = 'Answer:')

In [17]:
class SimpleDialogue(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.Predict(Dialogue)

    def forward(self, dialogue):

        return self.prog(dialogue = dialogue)


In [18]:
simple_dialogue = SimpleDialogue()

In [19]:
pred = simple_dialogue(dialogue = example.dialogue)
print("\nDIALOGUE:\n")
print(example.dialogue)

print("\nANSWER:\n")
print(example.label)

print("\nPREDICTION:\n")
print(pred)



DIALOGUE:

agent 0: I went to a wedding. I saw many of my friends from my grade school past.
agent 1: Wow, you remember your friends from grade school? Maybe if I saw them again those memories would come back.
agent 0: Yes I do. Some of them I am still very fond of.

ANSWER:

0

PREDICTION:

Prediction(
    label='0'
)


In [20]:
eval_metric(example, pred)

True

In [21]:
evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=10, return_outputs= True, return_all_scores=True)
results = evaluate(simple_dialogue)


Average Metric: 636 / 696  (91.4):  17%|█▋        | 696/4216 [01:36<06:01,  9.73it/s][2m2024-12-02T11:14:00.655050Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 litellm.BadRequestError: Together_aiException - Error code: 400 - {'error': {'message': 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo is not supported for JSON mode/function calling', 'type': 'invalid_request_error', 'param': None, 'code': 'constraints_model'}}. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 637.0 / 698  (91.3):  17%|█▋        | 698/4216 [01:36<05:34, 10.52it/s]


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



Average Metric: 2010.0 / 2200  (91.4):  52%|█████▏    | 2200/4216 [04:26<04:14,  7.91it/s][2m2024-12-02T11:16:51.885682Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 litellm.BadRequestError: Together_aiException - Error code: 400 - {'error': {'message': 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo is not supported for JSON mode/function calling', 'type': 'invalid_request_error', 'param': None, 'code': 'constraints_model'}}. Set `provide_traceback=True` to see the stack trace.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m
Average Metric: 2011.0 / 2202  (91.3):  52%|█████▏    | 2202/4216 [04:27<05:07,  6.56it/s]


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



Average Metric: 3865.0 / 4216  (91.7): 100%|██████████| 4216/4216 [09:10<00:00,  7.66it/s]


Unnamed: 0,dialogue,example_label,pred_label,eval_metric,label
0,agent 0: I really enjoy reading biographies alot. I like to learn about people and their lives. agent 1: I agree. How do you read?...,0.0,0,✔️ [True],
1,"agent 0: Yesterday, I forgot to flush the toilet at work. The look of disgust on my co-worker's face who just walked out was priceless....",0.0,0,✔️ [True],
2,agent 0: i grew up in wisconsin. i live in texas. agent 1: i used to know some people in dallas. are you near there?...,0.0,0,✔️ [True],
3,"agent 0: good afternoon, where are you coming from today? agent 1: hi there. i am coming from my snack shop in new york. you?...",1.0,1,✔️ [True],
4,"agent 0: One of the saddest things to me is when people underestimate what they can do and/or are capable of. agent 1: I know,...",1.0,1,✔️ [True],
5,agent 0: My manager keep lying to me. He is trying to advantage of me. agent 1: Do you want to yell at him and...,0.0,0,✔️ [True],
6,"agent 0: I've been married almost 23 years, but many years ago, my husband was acting a bit too ""chivalrous"" to a female acquaintance of...",0.0,0,✔️ [True],
7,"agent 0: What are some of John Grisham's movies? agent 1: I'm not quite sure but to my knowledge John Grisham was a writer, he...",1.0,1,✔️ [True],
8,agent 0: Green is a color between blue and yellow agent 1: OH YES I ALSO LIKE IT agent 0: It is evoked by light...,1.0,1,✔️ [True],
9,"agent 0: hello, I know I am not the only one, i love dogs. agent 1: yea me too i like the godendoodle which is...",0.0,1,,


In [23]:
items = []
for sample in results[1]:
    item = {}
    print(sample)
    dialog = sample[0]['dialogue']
    label = sample[0]['label']
    if sample[1] != {}:
        pred = sample[1]['label']
    else:
        pred = 'None'
    item['dialog'] = dialog
    item['label'] = label
    item['pred'] = pred
    items.append(item)
df_result = pd.DataFrame(data = items)
df_result.to_csv('results/dialogue/llama-0shot-dialogue.csv')

(Example({'dialogue': "agent 0: I really enjoy reading biographies alot. I like to learn about people and their lives.\nagent 1: I agree. How do you read? You can access libraries online such as Cardiff Central Library's mobile library service.\nagent 0: I usually read on my Kindle. You?\nagent 1: Yep I use Kindle too. Love ebooks! What's your favorite food?\nagent 0: Yes, ebooks are so handy! A great way to get my Stephen King Fix. As for food, I love Pad Thai\nagent 1: Pad thai is good, but how about pizza? I love mushroom and sausage\nagent 0: Ooh, you have good taste. I love mushroom on pizza, but no sausage. I'm a vegetarian\nagent 1: I enjoy a lot of great vegetarian recipes. do you have any favorites?\nagent 0: One thing I love is to blend cashew nuts as a parmesan substitute. It's amazing, you've got to try it!\nagent 1: Sounds like a great idea. Ever used nutritional yeast?", 'label': 0}) (input_keys={'dialogue'}), Prediction(
    label='0'
), True)
(Example({'dialogue': "agen

In [293]:
class CoTDialogue(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.ChainOfThought(Dialogue)

    def forward(self, dialogue):

        return self.prog(dialogue = dialogue)


In [294]:
cot_dialogue = CoTDialogue()
pred = cot_dialogue(dialogue = example.dialogue)
print("\nDIALOGUE:\n")
print(example.dialogue)
print("\nANSWER:\n")
print(example.label)
print("\nPREDICTION:\n")
print(pred)



DIALOGUE:

agent 0: i like playing games. how about you?
agent 1: some times i love the color green though
agent 0: I really enjoy eating junk food while playing games
agent 1: i likr to travel with my cat
agent 0: I like traveling in games, heard there's a new game about cats.
agent 1: ok good. cats can see in near darkness. I scared about dark so my pet is useful for me
agent 0: I sometimes get scared in games where it's really dark and mysterious
agent 1: the new one that just came out. lets play tomorrow?
agent 0: Sure! I'd love to try out the new game where the player is a cat
agent 1: good

ANSWER:

0

PREDICTION:

Prediction(
    reasoning='The last utterance "good" from agent 1 does not contradict the dialogue context. Throughout the conversation, both agents discuss their interests in games, cats, and the new game where the player is a cat. Agent 1\'s response "good" is an acknowledgment of agent 0\'s willingness to try the new game, which aligns with the context of the conve

In [295]:
evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=10, return_outputs= True, return_all_scores=True)
results = evaluate(cot_dialogue)


Average Metric: 3833 / 4216  (90.9): 100%|██████████| 4216/4216 [12:14<00:00,  5.74it/s]


Unnamed: 0,dialogue,example_label,reasoning,pred_label,eval_metric
0,agent 0: Yes I just can't help myself I love to sing and bathrooms have great acoustics agent 1: truth be told the multiple reflections...,1,The last utterance by agent 0 contradicts their previous statement where they mentioned being fine in the choir but not being a soloist. In the...,1,✔️ [True]
1,"agent 0: I love American football also know as gridiron. agent 1: It's a pretty fun sport to watch, also I have never heard it...",1,"The last utterance by agent 0, ""I've never heard of the gridiron,"" contradicts their earlier statement where they expressed love for American football, also known...",1,✔️ [True]
2,agent 0: I was mad when I got fired last year. I did nothing wrong agent 1: I feel sorry for you. Did you get...,1,The last utterance by agent 1 states that they spend all their time traveling and don't have time to work a real job. This does...,0,
3,"agent 0: I am a really picky eater, and now my favorite restaraunt is ruined for me. agent 1: Why? What happened? I hope you...",1,"The last utterance by agent 1 contradicts the dialogue context. Initially, agent 1 acknowledges the issue of a hair in the burger and suggests informing...",1,✔️ [True]
4,"agent 0: My wife and I will always be married to each other. agent 1: Oh, that does not seem very nice. agent 0: Why...",1,"The last utterance by agent 1 contradicts their earlier statement. Initially, agent 1 mentioned that they were in TV commercials when they were young, which...",1,✔️ [True]
5,agent 0: Someone suggested to me that I would enjoy John Grisham books. Can you tell me a little about him? agent 1: John Ray...,1,"The last utterance by agent 1 states that John Grisham's works don't seem to be too popular, which contradicts the earlier information provided that his...",1,✔️ [True]
6,"agent 0: near abigail, she is my best friend ever!!! agent 1: ah, that is cute. do you live in the town or countryside? agent...",0,"The last utterance by agent 0 does not contradict the dialogue context. Throughout the conversation, agent 0 has mentioned activities they do with their friend...",0,✔️ [True]
7,"agent 0: hi, how are you doing? agent 1: hello! i'm well, and you? agent 0: pretty good. i love playing board games, do you?...",1,"The last utterance by agent 0 states that they work in a high school, which contradicts their earlier statement about being an elementary teacher who...",1,✔️ [True]
8,"agent 0: I met my wife at a Nickleback concert. Do you know that Canadian rock band? agent 1: I've heard of them, but I...",1,"The dialogue context indicates that Nickelback released an album in 2001 called ""Silver Side Up."" However, the last utterance by agent 0 states that the...",1,✔️ [True]
9,"agent 0: I love to fish but i've never caught cod agent 1: Hey, well I actually grew up in an area and have fished...",0,"The dialogue context involves a discussion about fishing for cod, specifically focusing on the Atlantic cod. Agent 1 mentions that cod can be cured in...",0,✔️ [True]


In [152]:
items = []
for sample in results[1]:
    item = {}
    print(sample[0])
    dialog = sample[0]['dialogue']
    label = sample[0]['label']
    pred = sample[1]['label']
    rationale = sample[1]['reasoning']
    item['dialog'] = dialog
    item['label'] = label
    item['pred'] = pred
    item['reasoning'] = rationale
    items.append(item)
df_result = pd.DataFrame(data = items)
df_result.to_csv('results/dialogue/llama-0shot-cot-dialogue.csv')

Example({'dialogue': 'agent 0: My dream is to be a famous actress\nagent 1: Really? you should attend a Drama school where they train you in drama and theatre arts, such as acting.\nagent 0: What school do you suggest?\nagent 1: There are many you should research on the internet.  Entry is usually through a competitive audition process though\nagent 0: Do you know the name of a topmost drama school?\nagent 1: I don;t remember.  But audition usually involve the performance of monologues and group workshops, so you should be prepared for that.\nagent 0: What type of drama is performed in the school?', 'label': 0}) (input_keys={'dialogue'})
Example({'dialogue': "agent 0: hi. i'm an environmentalist from ohio. how are you?\nagent 1: i'm good i live in a rural farm area\nagent 0: do you use any alternative energy?\nagent 1: no is that how you make a living?\nagent 0: no. i just put solar panels on my roof, though. i work in social media.\nagent 1: can you save money that way?\nagent 0: Of c

In [15]:
def append_person(ds):
    for i,sample in enumerate(ds):
        # print(sample)
        modified_dialog = sample['dialog_context'] + [sample['modified_text']]
        original_dialog = sample['dialog_context'] + [sample['original_text']]
        for j, turn in enumerate(modified_dialog):
            turn = 'agent ' + str(j%2) + ': ' + turn
            modified_dialog[j] = turn
        for j, turn in enumerate(original_dialog):
            turn = 'agent ' + str(j%2) + ': ' + turn
            original_dialog[j] = turn
        ds[i]['original_dialog'] = '\n'.join(original_dialog)
        ds[i]['original_dialog'] = remove_space(ds[i]['original_dialog'])
        ds[i]['modified_dialog'] = '\n'.join(modified_dialog)
        ds[i]['modified_dialog'] = remove_space(ds[i]['modified_dialog'])

    return ds


# Evaluate by modification

## Without label change

In [16]:
def evaluate_modified_set(ds, program):
    examples = [
    dspy.Example({ 
                  "dialogue" : r['modified_dialog'], 
                  "original_dialogue": r['original_dialog'],
                  "label": int(r['label']),
                  "modified_label": int(r['label'])
                }
                  ).with_inputs("dialogue") 
    for r in ds
    ]
    evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=1, return_outputs= True, return_all_scores=True)
    results = evaluate(program)
    return results

In [17]:
class Dialogue(dspy.Signature):
    """Does the last utterance contradict the dialogue context? Answer with 1 if contradict, 0 if not contradict"""
    dialogue = dspy.InputField()
    label = dspy.OutputField(prefix = 'Answer:')

In [18]:
class SimpleDialogue(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.Predict(Dialogue)

    def forward(self, dialogue):

        return self.prog(dialogue = dialogue)
simple_dialogue = SimpleDialogue()

In [19]:
# Get all json files in the specified directory
json_files = glob.glob('../data/modified_data/dialogue/*_100.json')
original_pred_ds = pd.read_csv('results/dialogue/claude-3-5-sonnet-0shot-dialogue.csv', index_col=False)
original_pred_ds['dialog'] = original_pred_ds['dialog'].apply(remove_space)  # Replace 'your_function' with the actual function
# print(original_pred_ds['dialog'][1958])
for json_file in json_files:
    # if any(x in json_file for x in ['grammatical_role', 'negation']):
    #     continue
 
    # Load the json file
    print(json_file)
    # with open(json_file, 'r') as f:
    #     data = json.load(f)
    if not any(x in json_file for x in ['capitalization', 'typo_bias', 'punctuation', 'grammatical_role', 'negation']):
        data = pd.read_json(json_file)[1]
        data = list(data)
    else:
        with open(json_file,'r') as f:
            data = json.load(f)
        # data = pd.read_json(json_file)
        # data = data.to_json(orient = 'records')
        # data = ast.literal_eval(data)
    # print(data)
    data = append_person(data)
    results_modified = evaluate_modified_set(data, simple_dialogue)
    items = []
    for sample in results_modified[1]:
        item = {}
        # print(sample[0])
        modified_dialog = sample[0]['dialogue']
        original_dialog = sample[0]['original_dialogue']

        label = sample[0]['label']
        pred = sample[1]['label']
        # rationale = sample[1]['reasoning']
        # original_pred = compare_dialog(original_pred_ds, original_pred_ds['dialog'], original_dialog)
        original_dialog = remove_space(original_dialog)
        print(original_dialog)
        # print()
        original_pred = original_pred_ds.loc[original_pred_ds['dialog'] == original_dialog]['pred'].values[0]
        item['original_dialog'] = original_dialog
        item['modified_dialog'] = modified_dialog
        modified_pred = extract_prediction(pred)
        item['modified_label'] = label
        item['original_label'] = label
        item['modified_pred'] = modified_pred
        item['original_pred'] = original_pred
        # item['reasoning'] = sample[1]['reasoning']
        # item['reasoning'] = rationale
        items.append(item)
    
    df_result = pd.DataFrame(data=items)
    
    # Save results with filename based on input json
    output_filename = f"results/dialogue/claude-3-5-sonnet-0shot-{json_file.split('/')[-1].replace('.json', '')}.csv"
    df_result.to_csv(output_filename)


../preprocessing/data_after_phase2/rongxin/casual_100.json




KeyboardInterrupt: 

## With label change

In [41]:
def evaluate_modified_set(ds, program):
    examples = [
    dspy.Example({ 
                  "dialogue" : r['modified_dialog'], 
                  "original_dialogue": r['original_dialog'],
                  "label": int(r['modified_label']) if r.get('modified_label') != None else int(r['label']),
                  "original_label": int(r['label']),
                  "index": r['index'],
                  "type": r['type'] if r.get('type') != None else ''
                }
                  ).with_inputs("dialogue") 
    for r in ds
    ]
    evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=1, display_progress=True, display_table=1, return_outputs= True, return_all_scores=True)
    results = evaluate(program)
    return results

In [42]:
class Dialogue(dspy.Signature):
    """Does the last utterance contradict the dialogue context? Answer with 1 if contradict, 0 if not contradict"""
    dialogue = dspy.InputField()
    label = dspy.OutputField(prefix = 'Answer:')

In [43]:
class SimpleDialogue(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.Predict(Dialogue)

    def forward(self, dialogue):

        return self.prog(dialogue = dialogue)
simple_dialogue = SimpleDialogue()

In [36]:
import difflib


In [44]:


# Get all json files in the specified directory
json_files = glob.glob('../data/modified_data/dialogue/*_100.json')
original_pred_ds = pd.read_csv('results/dialogue/claude-3-5-sonnet-0shot-dialogue.csv', index_col=False)
original_pred_ds['dialog'] = original_pred_ds['dialog'].apply(remove_space)  # Replace 'your_function' with the actual function
# print(original_pred_ds['dialog'][1958])
for json_file in json_files:
    # if not any(x in json_file for x in ['negation']):
    #     continue
    if not any(x in json_file for x in ['geographical_bias']):
        continue
    # Load the json file
    # print(json_file)
    # with open(json_file, 'r') as f:
    #     data = json.load(f)
    # if any(x in json_file for x in ['capitalization', 'typo_bias', 'punctuation', 'grammatical_role', 'derivation', 'negation']):
    with open(json_file, 'r') as f:
        raw_data = json.load(f)
    data = [dict(sample, index=idx) for idx, sample in raw_data]
        # print(data[0])
        # break
        # data = pd.read_json(json_file)
        # data = data.to_json(orient = 'records')
        # data = ast.literal_eval(data)
    # print(data)
    data = append_person(data)
    results_modified = evaluate_modified_set(data, simple_dialogue)
    items = []
    for sample in results_modified[1]:
        item = {}
        # print(sample[0])
        modified_dialog = sample[0]['dialogue']
        original_dialog = sample[0]['original_dialogue']

        pred = sample[1]['label']
        # rationale = sample[1]['reasoning']
        # original_pred = compare_dialog(original_pred_ds, original_pred_ds['dialog'], original_dialog)
        original_dialog = remove_space(original_dialog)
        # print(original_dialog)
        # print()
        original_pred = original_pred_ds[original_pred_ds['index'] == sample[0]['index']]['pred'].iloc[0]
        item['original_dialog'] = original_dialog
        item['modified_dialog'] = modified_dialog
        modified_pred = extract_prediction(pred)
        item['modified_label'] = sample[0]['label']
        item['original_label'] = sample[0]['original_label']
        item['modified_pred'] = modified_pred
        item['original_pred'] = original_pred
        item['type'] = sample[0]['type']
        # item['reasoning'] = sample[1]['reasoning']
        # item['reasoning'] = rationale
        items.append(item)
    
    df_result = pd.DataFrame(data=items)
    
    # Save results with filename based on input json
    output_filename = f"results/dialogue/claude-3-5-sonnet-0shot-{json_file.split('/')[-1].replace('.json', '')}.csv"
    df_result.to_csv(output_filename)


Average Metric: 83 / 92  (90.2): 100%|██████████| 92/92 [00:00<00:00, 996.45it/s] 


Unnamed: 0,dialogue,original_dialogue,example_label,original_label,index,type,pred_label,eval_metric
0,agent 0: Cardigans make me look dapper. agent 1: I agree. They're associated with the Roaring Twenties which had many classy dressing styles. agent 0:...,agent 0: Cardigans make me look dapper. agent 1: I agree. They're associated with the Roaring Twenties which had many classy dressing styles. agent 0:...,1,1,3408,geographical_bias,1,✔️ [True]


# Aggregate results

In [97]:
import glob
from scipy import stats
import pandas as pd

In [98]:
result_files = glob.glob('results/dialogue/claude-3-5-sonnet-0shot-*_100.csv')

aggregated_results = []

for file in result_files:
    # Extract modification type from filename
    mod_type = file.split('-')[-1].replace('.csv','')
    
    # Read results file
    df = pd.read_csv(file)

    # Calculate accuracies
    original_correct = (df['original_pred'] == df['original_label']).sum()
    modified_correct = (df['modified_pred'] == df['modified_label']).sum()
    total = len(df)

    original_acc = original_correct / total
    modified_acc = modified_correct / total
    
    # Calculate the difference between original_res and modified_res
    difference = -round(original_acc - modified_acc, 2)
    
    # Calculate percentage difference with respect to total samples
    pct_difference = -round((original_correct - modified_correct) / original_correct * 100, 2)
    
    # Perform t-test between original and modified predictions
    t_stat, p_value = stats.ttest_ind(
        (df['original_pred'] == df['original_label']).astype(float),
        (df['modified_pred'] == df['modified_label']).astype(float)
    )
    
    aggregated_results.append({
        'task': 'dialogue_contradiction_detection',
        'modification': mod_type,
        'original_res': round(original_acc, 2),
        'modified_res': round(modified_acc, 2),
        'difference': difference,  # Difference in accuracy
        'pct_difference': pct_difference,  # Percentage difference relative to total samples
        'p_value': p_value  # Add p-value from t-test
    })

# Create final results dataframe
results_df = pd.DataFrame(aggregated_results)

# Sort the results based on modification_name
modification_name = ['temporal_bias_100', 'geographical_bias_100','length_bias_100', 'typo_bias_100', 'capitalization_100', 'punctuation_100', 'derivation_100', 'compound_word_100','active_to_passive_100','grammatical_role_100', 'coordinating_conjunction_100', 'concept_replacement_100','negation_100','discourse_100','sentiment_100','casual_100', 'dialectal_100']
results_df['modification'] = pd.Categorical(results_df['modification'], categories=modification_name, ordered=True)
results_df = results_df.sort_values(by='modification')

# Calculate averages across all modifications
avg_original = results_df['original_res'].mean()
avg_modified = results_df['modified_res'].mean()
avg_difference = avg_original - avg_modified
avg_pct_difference = results_df['pct_difference'].mean()

# Add averages as a new row
results_df.loc[len(results_df)] = {
    'task': 'dialogue_contradiction_detection',
    'modification': 'average',
    'original_res': round(avg_original, 2),
    'modified_res': round(avg_modified, 2),
    'difference': -round(avg_difference, 2),
    'pct_difference': round(avg_pct_difference, 2),
    'p_value': None  # No p-value for average row
}

print("\n")
results_df.to_csv('results/dialogue/claude-3-5-sonnet-DP.csv')

# Apply styling to highlight rows where original_res > modified_res and significant p-values
def highlight_drops_and_significance(row):
    colors = [''] * len(row)
    if row['original_res'] > row['modified_res']:
        colors = ['background-color: red'] * len(row)
        # If p-value < 0.05, add bold text
        if 'p_value' in row and row['p_value'] is not None and row['p_value'] < 0.05:
            colors = ['background-color: red; font-weight: bold'] * len(row)
    return colors

results_df.round(2).style.apply(highlight_drops_and_significance, axis=1)






  results_df.loc[len(results_df)] = {


Unnamed: 0,task,modification,original_res,modified_res,difference,pct_difference,p_value
9,dialogue_contradiction_detection,temporal_bias_100,0.9,0.95,0.04,4.71,0.27
0,dialogue_contradiction_detection,geographical_bias_100,0.89,0.82,-0.07,-7.89,0.19
8,dialogue_contradiction_detection,length_bias_100,0.92,0.96,0.04,4.6,0.24
10,dialogue_contradiction_detection,typo_bias_100,0.97,0.96,-0.01,-1.1,0.7
4,dialogue_contradiction_detection,capitalization_100,0.97,0.98,0.01,1.19,0.65
7,dialogue_contradiction_detection,punctuation_100,0.97,0.96,-0.01,-1.09,0.7
16,dialogue_contradiction_detection,derivation_100,0.95,0.98,0.03,3.41,0.25
14,dialogue_contradiction_detection,compound_word_100,0.94,0.95,0.01,1.1,0.76
3,dialogue_contradiction_detection,active_to_passive_100,0.95,0.92,-0.02,-2.27,0.55
6,dialogue_contradiction_detection,grammatical_role_100,0.96,0.94,-0.01,-1.49,0.7


In [100]:
# Filter for negation modification only
# negation_results = pd.read_csv('results/dialogue/claude-3-5-sonnet-DP.csv')
# negation_df = negation_results[negation_results['modification'] == 'negation_100']

# Load the detailed results for negation
negation_detailed = pd.read_csv('results/dialogue/claude-3-5-sonnet-0shot-negation_100.csv')

# Calculate accuracy for original and modified examples by type
type_results = []
for type_name in negation_detailed['type'].unique():
    type_data = negation_detailed[negation_detailed['type'] == type_name]
    
    # Calculate original and modified accuracies
    original_acc = (type_data['original_pred'] == type_data['original_label']).sum() / len(type_data)
    modified_acc = (type_data['modified_pred'] == type_data['modified_label']).sum() / len(type_data)
    
    # Calculate differences
    difference = -(original_acc - modified_acc)
    pct_difference = -round((original_acc - modified_acc) / original_acc * 100, 2)

    # Perform t-test
    original_correct = (type_data['original_pred'] == type_data['original_label']).astype(int)
    modified_correct = (type_data['modified_pred'] == type_data['modified_label']).astype(int)
    _, p_value = stats.ttest_rel(original_correct, modified_correct)
    
    type_results.append({
        'type': type_name,
        'num_samples': len(type_data),
        'original_res': round(original_acc * 100, 2),
        'modified_res': round(modified_acc * 100, 2),
        'difference': round(difference * 100, 2),
        'pct_difference': round(pct_difference, 2),
        'p_value': p_value
    })

# Create and display type-based results dataframe
type_results_df = pd.DataFrame(type_results)

# Calculate averages
avg_original = type_results_df['original_res'].mean()
avg_modified = type_results_df['modified_res'].mean()
avg_difference = avg_original - avg_modified
avg_pct_difference = type_results_df['pct_difference'].mean()
total_samples = type_results_df['num_samples'].sum()

# Add averages row
# type_results_df.loc[len(type_results_df)] = {
#     'type': 'average',
#     'num_samples': total_samples,
#     'original_res': round(avg_original, 2),
#     'modified_res': round(avg_modified, 2),
#     'difference': round(avg_difference, 2),
#     'pct_difference': round(avg_pct_difference, 2),
#     'p_value': None
# }

# Apply the same styling as before
styled_type_results = type_results_df.round(2).style.apply(highlight_drops_and_significance, axis=1)
styled_type_results


Unnamed: 0,type,num_samples,original_res,modified_res,difference,pct_difference,p_value
0,lexical,32,87.5,59.38,-28.12,-32.14,0.0
1,double,11,100.0,63.64,-36.36,-36.36,0.04
2,verbal,18,88.89,83.33,-5.56,-6.25,0.67
3,absolute,13,100.0,69.23,-30.77,-30.77,0.04
4,approximate,26,100.0,76.92,-23.08,-23.08,0.01


In [319]:
# Load results from different models
gpt4_df = pd.read_csv('results/dialogue/llama-0shot-dialogue.csv')
claude_df = pd.read_csv('results/dialogue/claude-3-5-sonnet-0shot-dialogue.csv')
mixtral_df = pd.read_csv('results/dialogue/mixtral-8x22b-0shot-dialogue.csv')

# Calculate accuracy between predictions and labels
gpt4_acc = (gpt4_df['pred'] == gpt4_df['label']).mean()
claude_acc = (claude_df['pred'] == claude_df['label']).mean()
mixtral_acc = (mixtral_df['pred'] == mixtral_df['label']).mean()
# Calculate average accuracy for each model
print(f"GPT-4 Average Accuracy: {gpt4_acc:.2%}")
print(f"Claude-3.5 Average Accuracy: {claude_acc:.2%}")
print(f"Mixtral Average Accuracy: {mixtral_acc:.2%}")

# Create comparison dataframe
comparison_df = pd.DataFrame({
    'Model': ['GPT-4', 'Claude-3.5', 'Mixtral'],
    'Accuracy': [gpt4_acc, claude_acc, mixtral_acc]
})

# Style the dataframe
def highlight_max(s):
    is_max = s == s.max()
    return ['background-color: green' if v else '' for v in is_max]

styled_df = comparison_df.style.apply(highlight_max, subset=['Accuracy'])
styled_df


GPT-4 Average Accuracy: 93.12%
Claude-3.5 Average Accuracy: 94.83%
Mixtral Average Accuracy: 86.12%


Unnamed: 0,Model,Accuracy
0,GPT-4,0.931214
1,Claude-3.5,0.948268
2,Mixtral,0.861243


# Llama-3-405B

In [360]:
lm = dspy.LM('together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo')
dspy.configure(lm=lm)