In [1]:
from datasets import load_dataset
import dspy
import openai
import os
import re
import pandas as pd
import json
import random
from dotenv import load_dotenv
import glob


In [2]:
load_dotenv()



True

In [3]:
openai.api_key = os.getenv('OPENAI_API_KEY')
openai.organization = os.getenv('OPENAI_ORGANIZATION')


In [4]:
lm = dspy.LM('openai/gpt-4o', temperature=0, max_tokens=250)
dspy.configure(lm=lm)

In [6]:
# ds = pd.read_json('data/sa/test.json', lines= True)
# ds = ds.to_dict('records')
ds = load_dataset('stanfordnlp/sst2')['validation']


In [139]:
ds

Dataset({
    features: ['idx', 'sentence', 'label'],
    num_rows: 872
})

In [5]:
def remove_space(text):
    """Clean up spacing and formatting in dialogue text."""
    # print(text)
    lines = text.split('\n')
    cleaned_lines = []
    # print(lines)
    for line in lines:
        # Remove multiple spaces
        cleaned = ' '.join(line.split())
        
        # Fix spacing around punctuation
        cleaned = re.sub(r'\s+([.,!?:;])', r'\1', cleaned)
        cleaned = re.sub(r'([.,!?:;])\s+', r'\1 ', cleaned)
        
        # Fix contractions
        cleaned = re.sub(r'\s*\'\s*s\b', "'s", cleaned)
        cleaned = re.sub(r'\s*n\s*\'\s*t\b', "n't", cleaned)
        cleaned = re.sub(r'\s*\'\s*ve\b', "'ve", cleaned)
        cleaned = re.sub(r'\s*\'\s*re\b', "'re", cleaned)
        cleaned = re.sub(r'\s*\'\s*ll\b', "'ll", cleaned)
        cleaned = re.sub(r'\s*\'\s*d\b', "'d", cleaned)
        cleaned = re.sub(r'\s*\'\s*m\b', "'m", cleaned)
        
        # Fix spaces around parentheses
        cleaned = re.sub(r'\(\s+', '(', cleaned)
        cleaned = re.sub(r'\s+\)', ')', cleaned)
        
        # Remove leading/trailing whitespace
        cleaned = cleaned.strip()
        
        cleaned_lines.append(cleaned)
        
    return '\n'.join(cleaned_lines)


In [11]:
examples = [
    dspy.Example({ 
                  "text": remove_space(r["sentence"]), 
                  "label": r["label"]}
                  
                  ).with_inputs("text") 
    for r in ds
    ]


NameError: name 'ds' is not defined

In [9]:
example = examples[835]
for k, v in example.items():
    print(f"\n{k.upper()}:\n")
    print(v)



TEXT:

a giggle-inducing comedy with snappy dialogue and winning performances by an unlikely team of oscar-winners: susan sarandon and goldie hawn.

LABEL:

1


In [6]:
def extract_prediction(text):
    matches = re.findall(r'\b[0-2]\b', text)
    # print(matches)
    parsed_answer = matches[-1] if matches else ""
    return parsed_answer

In [7]:
def eval_metric(true, prediction, trace=None):
    pred = prediction.label
    matches = re.findall(r'\b[0-2]\b', pred)
    parsed_answer = matches[-1] if matches else ""
    # print(parsed_answer)
    return parsed_answer == str(true.label)

# Evaluate the original test set

In [8]:
from dspy.evaluate import Evaluate



In [13]:
class Sentiment(dspy.Signature):
    """Classify sentiment of the given text. Answer with 1 for positive, 0 for negative."""
    text = dspy.InputField()
    label = dspy.OutputField(prefix = 'Answer:')

In [192]:
class SimpleSentiment(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.Predict(Sentiment)

    def forward(self, text):
        return self.prog(text=text)


In [193]:
simple_sentiment = SimpleSentiment()

In [150]:
pred = simple_sentiment(text=example.text)
print("\nQUESTION:\n")
print(example.text)
# print("\nANSWER:\n")
# print(example.label)
print("\nPREDICTION:\n")
print(pred)



QUESTION:

it's a charming and often affecting journey.

PREDICTION:

Prediction(
    label='1'
)


In [151]:
eval_metric(example, pred)

True

In [194]:
evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=10, return_outputs= True, return_all_scores=True)
results = evaluate(simple_sentiment)
items = []
for sample in results[1]:
    item = {}
    sentence = sample[0]['text']
    label = sample[0]['label']
    pred = sample[1]['label']
    item['text'] = sentence
    item['label'] = label
    item['pred'] = pred
    items.append(item)
df_result = pd.DataFrame(data = items)
df_result.to_csv('results/sa/gpt4o-0shot-sst2.csv')

Average Metric: 817 / 872  (93.7): 100%|██████████| 872/872 [01:43<00:00,  8.44it/s]


Unnamed: 0,text,example_label,pred_label,eval_metric
0,it's a charming and often affecting journey.,1,1,✔️ [True]
1,unflinchingly bleak and desperate,0,0,✔️ [True]
2,allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker.,1,1,✔️ [True]
3,"the acting, costumes, music, cinematography and sound are all astounding given the production's austere locales.",1,1,✔️ [True]
4,"it's slow -- very, very slow.",0,0,✔️ [True]
5,"although laced with humor and a few fanciful touches, the film is a refreshingly serious look at young women.",1,1,✔️ [True]
6,a sometimes tedious film.,0,0,✔️ [True]
7,or doing last year's taxes with your ex-wife.,0,0,✔️ [True]
8,you don't have to know about music to appreciate the film's easygoing blend of comedy and romance.,1,1,✔️ [True]
9,"in exactly 89 minutes, most of which passed as slowly as if i'd been sitting naked on an igloo, formula 51 sank from quirky to...",0,0,✔️ [True]


In [40]:
class CoTSentiment(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.ChainOfThought(Sentiment)

    def forward(self, text):
        return self.prog(text=text)


In [107]:
cot_sentiment = CoTSentiment()
pred = cot_sentiment(text=example.text)
print("\nQUESTION:\n")
print(example.text)
# print("\nANSWER:\n")
# print(example.label)
print("\nPREDICTION:\n")
print(pred)



QUESTION:

it 's a charming and often affecting journey . 

PREDICTION:

Prediction(
    reasoning='The text describes the journey as "charming" and "often affecting," which are positive adjectives. "Charming" suggests that the journey is delightful and pleasing, while "affecting" implies that it has a significant emotional impact. Both terms indicate a positive sentiment towards the journey.',
    label='1'
)


In [108]:
lm.inspect_history()





[34m[2024-11-14T19:10:57.926980][0m

[31mSystem message:[0m

Your input fields are:
1. `text` (str)

Your output fields are:
1. `reasoning` (str)
2. `label` (str)

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## text ## ]]
{text}

[[ ## reasoning ## ]]
{reasoning}

[[ ## label ## ]]
{label}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Classify sentiment of the given text. Answer with 1 for positive, 0 for negative, or 2 for neutral.


[31mUser message:[0m

[[ ## text ## ]]
it 's a charming and often affecting journey .

Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## label ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.


[31mResponse:[0m

[32m[[ ## reasoning ## ]]
The text describes the journey as "charming" and "often affecting," which are positive adjectives. "Charming" suggests that the journey is 

In [42]:
evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=10, return_outputs= True, return_all_scores=True)
results = evaluate(cot_sentiment)
items = []
for sample in results[1]:
    item = {}
    sentence = sample[0]['text']
    label = sample[0]['label']
    pred = sample[1]['label']
    item['text'] = sentence
    item['label'] = label
    item['pred'] = pred
    items.append(item)
df_result = pd.DataFrame(data = items)
df_result.to_csv('gpt4o-0shot-cot-sst2.csv')

Average Metric: 760 / 872  (87.2): 100%|██████████| 872/872 [05:35<00:00,  2.60it/s]


Unnamed: 0,text,example_label,reasoning,pred_label,eval_metric
0,it 's a charming and often affecting journey .,1,"The text describes the journey as ""charming"" and ""often affecting,"" which are positive adjectives. ""Charming"" suggests that the journey is delightful and pleasing, while ""affecting""...",1,✔️ [True]
1,unflinchingly bleak and desperate,0,"The text ""unflinchingly bleak and desperate"" uses words that convey a strong sense of negativity. ""Bleak"" suggests a lack of hope or positivity, and ""desperate""...",0,✔️ [True]
2,allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker .,1,"The text expresses optimism and hope regarding Nolan's future career, suggesting that he is on the verge of achieving significant success as a filmmaker who...",1,✔️ [True]
3,"the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales .",1,"The text praises multiple aspects of the production, such as acting, costumes, music, cinematography, and sound, describing them as ""astounding."" This indicates a strong positive...",1,✔️ [True]
4,"it 's slow -- very , very slow .",0,"The text describes something as ""slow"" and emphasizes this by repeating ""very, very slow."" This repetition suggests a negative sentiment, as slowness is typically not...",0,✔️ [True]
5,"although laced with humor and a few fanciful touches , the film is a refreshingly serious look at young women .",1,"The text describes the film as ""refreshingly serious"" and mentions it is ""laced with humor and a few fanciful touches,"" which suggests a positive sentiment....",1,✔️ [True]
6,a sometimes tedious film .,0,"The text describes the film as ""sometimes tedious,"" which suggests a negative sentiment. The word ""tedious"" implies that the film can be boring or monotonous...",0,✔️ [True]
7,or doing last year 's taxes with your ex-wife .,0,"The text refers to the task of doing taxes with an ex-wife, which is likely to be an uncomfortable or unpleasant situation due to the...",0,✔️ [True]
8,you do n't have to know about music to appreciate the film 's easygoing blend of comedy and romance .,1,"The text suggests that the film is enjoyable and accessible even to those who are not knowledgeable about music. The use of the words ""easygoing...",1,✔️ [True]
9,"in exactly 89 minutes , most of which passed as slowly as if i 'd been sitting naked on an igloo , formula 51 sank...",0,"The text describes the experience of watching ""formula 51"" as unpleasant and slow, using negative imagery such as ""sitting naked on an igloo"" and describing...",0,✔️ [True]


# Evaluate by modification

## Without label change

In [11]:
def evaluate_modified_set(ds, program):
    examples = [
    dspy.Example({ 
                  "text" : remove_space(r['modified_text']), 
                  "original_text": remove_space(r['original_text']),
                  "label": int(r['label']),
                  "modified_label": int(r['label'])
                }
                  ).with_inputs("text") 
    for r in ds
    ]
    evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=1, return_outputs= True, return_all_scores=True)
    results = evaluate(program)
    return results

In [12]:
class Sentiment(dspy.Signature):
    """Classify sentiment of the given text. Answer with 1 for positive, 0 for negative."""
    text = dspy.InputField()
    label = dspy.OutputField(prefix = 'Answer:')
class SimpleSentiment(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.Predict(Sentiment)

    def forward(self, text):
        return self.prog(text=text)
simple_sentiment = SimpleSentiment()

In [16]:

# Configure GPT-4 as the language model
lm = dspy.LM('openai/gpt-4o', temperature=0, max_tokens=300)
dspy.configure(lm=lm)
original_pred_ds = pd.read_csv('results/sa/gpt4o-0shot-sst2.csv', index_col=False)
original_pred_ds['text'] = original_pred_ds['text'].apply(remove_space)  # Replace 'your_function' with the actual function

# Get all json files in the specified directory
json_files = glob.glob('../data/modified_data/sa/*_100.json')

for json_file in json_files:
    # Load the json file
    print(json_file)
    if 'grammatical_role' in json_file or 'negation' in json_file:
        continue
    with open(json_file, 'r') as f:
        data = json.load(f)
    
    results = evaluate_modified_set(data,simple_sentiment)

    
    # Convert results to dataframe
    items = []
    for sample in results[1]:
        item = {}
        sentence = sample[0]['text']
        label = sample[0]['label'] 
        pred = sample[1]['label']
        item['text'] = sentence
        item['modified_label'] = label
        pred = extract_prediction(pred)
        item['modified_pred'] = pred
        original_text = sample[0]['original_text'].encode('utf-8').decode('unicode-escape')
        # escaped_string.encode('utf-8').decode('unicode-escape')
        # print(original_text)
        item['original_label'] = sample[0]['label']
        item['original_text'] = original_text
        item['original_pred'] = original_pred_ds.loc[original_pred_ds['text'] == original_text]['pred'].values[0]
        items.append(item)
    
    df_result = pd.DataFrame(data=items)
    
    # Save results with filename based on input json
    output_filename = f"results/sa/gpt4o-0shot-{json_file.split('/')[-1].replace('.json', '')}.csv"
    df_result.to_csv(output_filename)


../preprocessing/data_after_phase2/yulia/casual_100.json


Average Metric: 93 / 100  (93.0): 100%|██████████| 100/100 [00:00<00:00, 772.96it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,it shows that fincher is a director who skillfully uses tech skills to dig into what makes people tick.,it confirms fincher's status as a film maker who artfully bends technical know-how to the service of psychological insight.,1,1,1,✔️ [True]


../preprocessing/data_after_phase2/yulia/discourse_100.json


Average Metric: 86 / 99  (86.9): 100%|██████████| 99/99 [00:04<00:00, 23.67it/s] 


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,"too often, the viewer isn't reacting to humor, rather they are wincing back in repugnance.","too often, the viewer isn't reacting to humor so much as they are wincing back in repugnance.",0,0,0,✔️ [True]


../preprocessing/data_after_phase2/yulia/compound_word_100.json


Average Metric: 87 / 95  (91.6): 100%|██████████| 95/95 [00:00<00:00, 732.79it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,"escaping the studio, piccoli is warmly affecting and so is this adroitly minimalist film-making.","escaping the studio, piccoli is warmly affecting and so is this adroitly minimalist movie.",1,1,1,✔️ [True]


../preprocessing/data_after_phase2/yulia/temporal_bias_100.json


Average Metric: 90 / 100  (90.0): 100%|██████████| 100/100 [00:00<00:00, 769.66it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,i'll wager the video game is a lot more fun than the film.,i'll bet the video game is a lot more fun than the film.,0,0,0,✔️ [True]


../preprocessing/data_after_phase2/yulia/coordinating_conjunction_100.json


Average Metric: 93 / 100  (93.0): 100%|██████████| 100/100 [00:00<00:00, 134.16it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,"the far future may be awesome to consider, but from period detail to matters of the heart, this film is most transporting and captivating when...","the far future may be awesome to consider, but from period detail to matters of the heart, this film is most transporting when it stays...",1,1,1,✔️ [True]


../preprocessing/data_after_phase2/yulia/capitalization_100.json


Average Metric: 91 / 99  (91.9): 100%|██████████| 99/99 [00:00<00:00, 728.03it/s] 


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,THIS movie is maddening.,this movie is maddening.,0,0,0,✔️ [True]


../preprocessing/data_after_phase2/yulia/dialectal_100.json


Average Metric: 81 / 100  (81.0): 100%|██████████| 100/100 [00:00<00:00, 432.74it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,"the reality of the new live-action pinocchio he has directed, cowritten and starred in is bordering on the grotesque.","the reality of the new live-action pinocchio he directed, cowrote and starred in borders on the grotesque.",0,0,0,✔️ [True]


../preprocessing/data_after_phase2/yulia/sentiment_100.json


Average Metric: 82 / 100  (82.0): 100%|██████████| 100/100 [00:00<00:00, 894.73it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,"may be far from the best of the series, but it's assured, wonderfully respectful of its past and thrilling enough to make it abundantly clear...","may be far from the best of the series, but it's assured, wonderfully respectful of its past and thrilling enough to make it abundantly clear...",1,1,0,


../preprocessing/data_after_phase2/yulia/grammatical_role_100.json
../preprocessing/data_after_phase2/yulia/length_bias_100.json


Average Metric: 93 / 100  (93.0): 100%|██████████| 100/100 [00:00<00:00, 795.86it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,... a truly magnificent drama that is well worth tracking down.,... a magnificent drama well worth tracking down.,1,1,1,✔️ [True]


../preprocessing/data_after_phase2/yulia/concept_replacement_100.json


Average Metric: 88 / 100  (88.0): 100%|██████████| 100/100 [00:00<00:00, 124.85it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,or doing last year's paperwork with your ex-wife.,or doing last year's taxes with your ex-wife.,0,0,0,✔️ [True]


../preprocessing/data_after_phase2/yulia/typo_bias_100.json


Average Metric: 92 / 100  (92.0): 100%|██████████| 100/100 [00:00<00:00, 805.57it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,moretti's compellling anatomy of grief and the difficult process of adapting to loss.,moretti's compelling anatomy of grief and the difficult process of adapting to loss.,0,0,0,✔️ [True]


../preprocessing/data_after_phase2/yulia/geographical_bias_100.json


Average Metric: 91 / 100  (91.0): 100%|██████████| 100/100 [00:00<00:00, 813.69it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,the affectionate quirkiness that once seemed inherent to Seewoosagur's viewpoint struggles to shine through amidst the outdated charming mystery plot and the modern Bollywood-style post-production...,the affectionate loopiness that once seemed congenital to demme's perspective has a tough time emerging from between the badly dated cutesy-pie mystery scenario and the...,0,0,0,✔️ [True]


../preprocessing/data_after_phase2/yulia/punctuation_100.json


Average Metric: 91 / 100  (91.0): 100%|██████████| 100/100 [00:00<00:00, 395.38it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,"just one bad idea, after another.",just one bad idea after another.,0,0,0,✔️ [True]


../preprocessing/data_after_phase2/yulia/derivation_100.json


Average Metric: 86 / 95  (90.5): 100%|██████████| 95/95 [00:00<00:00, 867.08it/s] 


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,"(t) here's only so much anyone can do with a showy, overplotted, anne rice rock'n' roll vampire novel before the built-in silliness of the whole...","(t) here's only so much anyone can do with a florid, overplotted, anne rice rock 'n' roll vampire novel before the built-in silliness of the...",0,0,0,✔️ [True]


../preprocessing/data_after_phase2/yulia/active_to_passive_100.json


Average Metric: 89 / 100  (89.0): 100%|██████████| 100/100 [00:00<00:00, 136.68it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,"As the two leads, Lathan and Diggs are seen as charming and are perceived to have chemistry both as friends and lovers.","as the two leads, lathan and diggs are charming and have chemistry both as friends and lovers.",1,1,1,✔️ [True]


../preprocessing/data_after_phase2/yulia/negation_100.json


## With label change

In [9]:
def evaluate_modified_set(ds, program):
    examples = [
    dspy.Example({ 
                  "text" : remove_space(r['modified_text']), 
                  "original_text": remove_space(r['original_text']),
                  "label": int(r['modified_label']) if r.get('modified_label') != None else int(r['label']),
                  "original_label": int(r['label']),
                  "type": r['type'] if r.get('type') != None else None
                }
                  ).with_inputs("text") 
    for r in ds
    ]
    evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=1, return_outputs= True, return_all_scores=True)
    results = evaluate(program)
    return results

In [10]:
class Sentiment(dspy.Signature):
    """Classify sentiment of the given text. Answer with 1 for positive, 0 for negative."""
    text = dspy.InputField()
    label = dspy.OutputField(prefix = 'Answer:')
class SimpleSentiment(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.Predict(Sentiment)

    def forward(self, text):
        return self.prog(text=text)
simple_sentiment = SimpleSentiment()

In [11]:
original_pred_ds = pd.read_csv('results/sa/gpt4o-0shot-sst2.csv', index_col=False)
original_pred_ds['text'] = original_pred_ds['text'].apply(remove_space)  # Replace 'your_function' with the actual function

# Get all json files in the specified directory
json_files = glob.glob('../data/modified_data/sa/*_100.json')

for json_file in json_files:
    # Load the json file
    print(json_file)
    if not any(x in json_file for x in ['sentiment']):
        continue
    with open(json_file, 'r') as f:
        data = json.load(f)
    
    results = evaluate_modified_set(data,simple_sentiment)

    
    # Convert results to dataframe
    items = []
    for sample in results[1]:
        item = {}
        sentence = sample[0]['text']
        pred = sample[1]['label']
        item['text'] = sentence
        item['modified_label'] = sample[0]['label'] 
        pred = extract_prediction(pred)
        item['modified_pred'] = pred
        # original_text = sample[0]['original_text']
        original_text = sample[0]['original_text'].encode('utf-8').decode('unicode-escape')

        # print(original_text)
        item['original_label'] = sample[0]['original_label']
        item['original_text'] = original_text
        item['original_pred'] = original_pred_ds.loc[original_pred_ds['text'] == original_text]['pred'].values[0]
        item['type'] = sample[0]['type']
        items.append(item)
    
    df_result = pd.DataFrame(data=items)
    
    
    # Save results with filename based on input json
    output_filename = f"results/sa/gpt4o-0shot-{json_file.split('/')[-1].replace('.json', '')}.csv"
    df_result.to_csv(output_filename)


../preprocessing/data_after_phase2/yulia/casual_100.json
../preprocessing/data_after_phase2/yulia/discourse_100.json
../preprocessing/data_after_phase2/yulia/compound_word_100.json
../preprocessing/data_after_phase2/yulia/temporal_bias_100.json
../preprocessing/data_after_phase2/yulia/coordinating_conjunction_100.json
../preprocessing/data_after_phase2/yulia/capitalization_100.json
../preprocessing/data_after_phase2/yulia/dialectal_100.json
../preprocessing/data_after_phase2/yulia/sentiment_100.json


Average Metric: 91 / 100  (91.0): 100%|██████████| 100/100 [00:13<00:00,  7.62it/s]


Unnamed: 0,text,original_text,example_label,original_label,type,pred_label,eval_metric
0,"may be far from the best of the series, but it's assured, wonderfully respectful of its past and thrilling enough to make it abundantly clear...","may be far from the best of the series, but it's assured, wonderfully respectful of its past and thrilling enough to make it abundantly clear...",0,1,sentiment,0,✔️ [True]


../preprocessing/data_after_phase2/yulia/grammatical_role_100.json
../preprocessing/data_after_phase2/yulia/length_bias_100.json
../preprocessing/data_after_phase2/yulia/concept_replacement_100.json
../preprocessing/data_after_phase2/yulia/typo_bias_100.json
../preprocessing/data_after_phase2/yulia/geographical_bias_100.json
../preprocessing/data_after_phase2/yulia/punctuation_100.json
../preprocessing/data_after_phase2/yulia/derivation_100.json
../preprocessing/data_after_phase2/yulia/active_to_passive_100.json
../preprocessing/data_after_phase2/yulia/negation_100.json


# Aggregate results

In [43]:
from scipy import stats

In [44]:
result_files = glob.glob('results/sa/gpt4o-0shot-*_100.csv')

aggregated_results = []

for file in result_files:
    # Extract modification type from filename
    mod_type = file.split('-')[-1].replace('.csv','')
    
    # Read results file
    df = pd.read_csv(file)

    # Calculate accuracies
    original_correct = (df['original_pred'] == df['original_label']).sum()
    modified_correct = (df['modified_pred'] == df['modified_label']).sum()
    total = len(df)

    original_acc = original_correct / total
    modified_acc = modified_correct / total
    
    # Calculate the difference between original_res and modified_res
    difference = -round(original_acc - modified_acc, 2)
    
    # Calculate percentage difference with respect to total samples
    pct_difference = -round((original_correct - modified_correct) / original_correct * 100, 2)
    
    # Perform t-test between original and modified predictions
    t_stat, p_value = stats.ttest_ind(
        (df['original_pred'] == df['original_label']).astype(float),
        (df['modified_pred'] == df['modified_label']).astype(float)
    )
    
    aggregated_results.append({
        'task': 'dialogue_contradiction_detection',
        'modification': mod_type,
        'original_res': round(original_acc, 2),
        'modified_res': round(modified_acc, 2),
        'difference': difference,  # Difference in accuracy
        'pct_difference': pct_difference,  # Percentage difference relative to total samples
        'p_value': p_value  # Add p-value from t-test
    })

# Create final results dataframe
results_df = pd.DataFrame(aggregated_results)

# Sort the results based on modification_name
modification_name = ['temporal_bias_100', 'geographical_bias_100','length_bias_100', 'typo_bias_100', 'capitalization_100', 'punctuation_100', 'derivation_100', 'compound_word_100','active_to_passive_100','grammatical_role_100', 'coordinating_conjunction_100', 'concept_replacement_100','negation_100','discourse_100','sentiment_100','casual_100', 'dialectal_100']
results_df['modification'] = pd.Categorical(results_df['modification'], categories=modification_name, ordered=True)
results_df = results_df.sort_values(by='modification')

# Calculate averages across all modifications
avg_original = results_df['original_res'].mean()
avg_modified = results_df['modified_res'].mean()
avg_difference = avg_original - avg_modified
avg_pct_difference = results_df['pct_difference'].mean()

# Add averages as a new row
results_df.loc[len(results_df)] = {
    'task': 'dialogue_contradiction_detection',
    'modification': 'average',
    'original_res': round(avg_original, 2),
    'modified_res': round(avg_modified, 2),
    'difference': -round(avg_difference, 2),
    'pct_difference': round(avg_pct_difference, 2),
    'p_value': None  # No p-value for average row
}

print("\n")
results_df.to_csv('results/sa/gpt4o-DP.csv')

# Apply styling to highlight rows where original_res > modified_res and significant p-values
def highlight_drops_and_significance(row):
    colors = [''] * len(row)
    if row['original_res'] > row['modified_res']:
        colors = ['background-color: red'] * len(row)
        # If p-value < 0.05, add bold text
        if 'p_value' in row and row['p_value'] is not None and row['p_value'] < 0.05:
            colors = ['background-color: red; font-weight: bold'] * len(row)
    return colors

results_df.round(2).style.apply(highlight_drops_and_significance, axis=1)






  results_df.loc[len(results_df)] = {


Unnamed: 0,task,modification,original_res,modified_res,difference,pct_difference,p_value
8,dialogue_contradiction_detection,temporal_bias_100,0.9,0.9,-0.0,-0.0,1.0
2,dialogue_contradiction_detection,geographical_bias_100,0.92,0.91,-0.01,-1.09,0.8
11,dialogue_contradiction_detection,length_bias_100,0.92,0.93,0.01,1.09,0.79
13,dialogue_contradiction_detection,typo_bias_100,0.92,0.92,-0.0,-0.0,1.0
3,dialogue_contradiction_detection,capitalization_100,0.91,0.92,0.01,1.11,0.8
10,dialogue_contradiction_detection,punctuation_100,0.9,0.91,0.01,1.11,0.81
7,dialogue_contradiction_detection,derivation_100,0.93,0.91,-0.02,-2.47,0.58
16,dialogue_contradiction_detection,compound_word_100,0.94,0.92,-0.02,-2.25,0.58
6,dialogue_contradiction_detection,active_to_passive_100,0.93,0.89,-0.04,-4.3,0.33
1,dialogue_contradiction_detection,grammatical_role_100,0.91,0.92,0.02,1.67,0.76


In [28]:
# Load results from different models
gpt4_df = pd.read_csv('results/sa/gpt4o-0shot-sst2.csv')
claude_df = pd.read_csv('results/sa/claude-3-5-sonnet-0shot-sst2.csv')
mixtral_df = pd.read_csv('results/sa/mixtral-8x22b-sst2.csv')

# Calculate accuracy between predictions and labels
gpt4_acc = (gpt4_df['pred'] == gpt4_df['label']).mean()
claude_acc = (claude_df['pred'] == claude_df['label']).mean()
mixtral_acc = (mixtral_df['pred'] == mixtral_df['label']).mean()
# Calculate average accuracy for each model
print(f"GPT-4 Average Accuracy: {gpt4_acc:.2%}")
print(f"Claude-3.5 Average Accuracy: {claude_acc:.2%}")
print(f"Mixtral Average Accuracy: {mixtral_acc:.2%}")

# Create comparison dataframe
comparison_df = pd.DataFrame({
    'Model': ['GPT-4', 'Claude-3.5', 'Mixtral'],
    'Accuracy': [gpt4_acc, claude_acc, mixtral_acc]
})

# Style the dataframe
def highlight_max(s):
    is_max = s == s.max()
    return ['background-color: green' if v else '' for v in is_max]

styled_df = comparison_df.style.apply(highlight_max, subset=['Accuracy'])
styled_df


GPT-4 Average Accuracy: 93.69%
Claude-3.5 Average Accuracy: 91.51%
Mixtral Average Accuracy: 83.37%


Unnamed: 0,Model,Accuracy
0,GPT-4,0.936927
1,Claude-3.5,0.915138
2,Mixtral,0.833716
