In [7]:
from datasets import load_dataset
import dspy
import openai
import os
import re
import pandas as pd
import json
import random
from dotenv import load_dotenv
import glob


In [8]:
load_dotenv()



True

In [9]:
openai.api_key = os.getenv('OPENAI_API_KEY')
openai.organization = os.getenv('OPENAI_ORGANIZATION')


In [10]:
lm = dspy.LM('openrouter/anthropic/claude-3.5-sonnet', temperature=0, max_tokens=1024)
dspy.configure(lm=lm)

In [11]:
# ds = pd.read_json('data/sent/test.json', lines= True)
# ds = ds.to_dict('records')
# ds = load_dataset('stanfordnlp/sst2')['validation']
ds = pd.read_json('../data/train_dev_test_data/sent/test.json')
ds = ds.to_dict('records')



In [12]:
ds

[{'idx': 0,
  'sentence': "it's a charming and often affecting journey.",
  'label': 1},
 {'idx': 1, 'sentence': 'unflinchingly bleak and desperate', 'label': 0},
 {'idx': 2,
  'sentence': 'allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker.',
  'label': 1},
 {'idx': 3,
  'sentence': "the acting, costumes, music, cinematography and sound are all astounding given the production's austere locales.",
  'label': 1},
 {'idx': 4, 'sentence': "it's slow -- very, very slow.", 'label': 0},
 {'idx': 5,
  'sentence': 'although laced with humor and a few fanciful touches, the film is a refreshingly serious look at young women.',
  'label': 1},
 {'idx': 6, 'sentence': 'a sometimes tedious film.', 'label': 0},
 {'idx': 7,
  'sentence': "or doing last year's taxes with your ex-wife.",
  'label': 0},
 {'idx': 8,
  'sentence': "you don't have to know about music to appreciate the film's easygoing blend of comedy and romance.",
  'label': 1},
 {'idx':

In [13]:
def remove_space(text):
    """Clean up spacing and formatting in dialogue text."""
    lines = text.split('\n')
    cleaned_lines = []
    
    for line in lines:
        # Remove multiple spaces
        cleaned = ' '.join(line.split())
        
        # Fix spacing around punctuation
        cleaned = re.sub(r'\s+([.,!?:;])', r'\1', cleaned)
        cleaned = re.sub(r'([.,!?:;])\s+', r'\1 ', cleaned)
        
        # Fix contractions
        cleaned = re.sub(r'\s*\'\s*s\b', "'s", cleaned)
        cleaned = re.sub(r'\s*n\s*\'\s*t\b', "n't", cleaned)
        cleaned = re.sub(r'\s*\'\s*ve\b', "'ve", cleaned)
        cleaned = re.sub(r'\s*\'\s*re\b', "'re", cleaned)
        cleaned = re.sub(r'\s*\'\s*ll\b', "'ll", cleaned)
        cleaned = re.sub(r'\s*\'\s*d\b', "'d", cleaned)
        cleaned = re.sub(r'\s*\'\s*m\b', "'m", cleaned)
        
        # Fix spaces around parentheses
        cleaned = re.sub(r'\(\s+', '(', cleaned)
        cleaned = re.sub(r'\s+\)', ')', cleaned)
        
        # Remove leading/trailing whitespace
        cleaned = cleaned.strip()
        
        cleaned_lines.append(cleaned)
        
    return '\n'.join(cleaned_lines)


In [14]:
examples = [
    dspy.Example({ 
                  "text": remove_space(r["sentence"]), 
                  "label": r["label"]}
                  
                  ).with_inputs("text") 
    for r in ds
    ]


In [15]:
len(examples)

872

In [16]:
example = examples[835]
for k, v in example.items():
    print(f"\n{k.upper()}:\n")
    print(v)



TEXT:

a giggle-inducing comedy with snappy dialogue and winning performances by an unlikely team of oscar-winners: susan sarandon and goldie hawn.

LABEL:

1


In [17]:
def extract_prediction(text):
    matches = re.findall(r'\b[0-2]\b', text)
    # print(matches)
    parsed_answer = matches[-1] if matches else ""
    return parsed_answer

In [18]:
def eval_metric(true, prediction, trace=None):
    pred = prediction.label
    matches = re.findall(r'\b[0-2]\b', pred)
    parsed_answer = matches[-1] if matches else ""
    # print(parsed_answer)
    return parsed_answer == str(true.label)

# Evaluate the original test set

In [19]:
from dspy.evaluate import Evaluate

evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=10)


In [20]:
class Sentiment(dspy.Signature):
    """Classify sentiment of the given text. Answer with 1 for positive, 0 for negative."""
    text = dspy.InputField()
    label = dspy.OutputField(prefix = 'Answer:')

In [21]:
class SimpleSentiment(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.Predict(Sentiment)

    def forward(self, text):
        return self.prog(text=text)


In [22]:
simple_sentiment = SimpleSentiment()

In [23]:
pred = simple_sentiment(text=example.text)
print("\nQUESTION:\n")
print(example.text)
# print("\nANSWER:\n")
# print(example.label)
print("\nPREDICTION:\n")
print(pred)



QUESTION:

a giggle-inducing comedy with snappy dialogue and winning performances by an unlikely team of oscar-winners: susan sarandon and goldie hawn.

PREDICTION:

Prediction(
    label='1'
)


In [27]:
eval_metric(example, pred)

True

In [28]:
evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=10, return_outputs= True, return_all_scores=True)
results = evaluate(simple_sentiment)
items = []
for sample in results[1]:
    item = {}
    sentence = sample[0]['text']
    label = sample[0]['label']
    pred = sample[1]['label']
    item['text'] = sentence
    item['label'] = label
    item['pred'] = pred
    items.append(item)
df_result = pd.DataFrame(data = items)
df_result.to_csv('results/sent/claude-3-5-sonnet-0shot-sst2.csv')

  0%|          | 0/872 [00:00<?, ?it/s]

Average Metric: 833 / 872  (95.5): 100%|██████████| 872/872 [02:15<00:00,  6.43it/s]


Unnamed: 0,text,example_label,pred_label,eval_metric
0,it's a charming and often affecting journey.,1,1,✔️ [True]
1,unflinchingly bleak and desperate,0,0,✔️ [True]
2,allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker.,1,1,✔️ [True]
3,"the acting, costumes, music, cinematography and sound are all astounding given the production's austere locales.",1,1,✔️ [True]
4,"it's slow -- very, very slow.",0,0,✔️ [True]
5,"although laced with humor and a few fanciful touches, the film is a refreshingly serious look at young women.",1,1,✔️ [True]
6,a sometimes tedious film.,0,0,✔️ [True]
7,or doing last year's taxes with your ex-wife.,0,0,✔️ [True]
8,you don't have to know about music to appreciate the film's easygoing blend of comedy and romance.,1,1,✔️ [True]
9,"in exactly 89 minutes, most of which passed as slowly as if i'd been sitting naked on an igloo, formula 51 sank from quirky to...",0,0,✔️ [True]


In [29]:
class CoTSentiment(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.ChainOfThought(Sentiment)

    def forward(self, text):
        return self.prog(text=text)


In [30]:
cot_sentiment = CoTSentiment()
pred = cot_sentiment(text=example.text)
print("\nQUESTION:\n")
print(example.text)
# print("\nANSWER:\n")
# print(example.label)
print("\nPREDICTION:\n")
print(pred)



QUESTION:

a giggle-inducing comedy with snappy dialogue and winning performances by an unlikely team of oscar-winners: susan sarandon and goldie hawn.

PREDICTION:

Prediction(
    reasoning='The text describes the movie as "giggle-inducing" and having "snappy dialogue" and "winning performances", which are all positive descriptors. Additionally, it mentions the involvement of Oscar-winners, which adds to the prestige and credibility of the movie. The tone of the text is overwhelmingly positive, indicating a favorable sentiment towards the movie.',
    label='1'
)


In [108]:
lm.inspect_history()





[34m[2024-11-14T19:10:57.926980][0m

[31mSystem message:[0m

Your input fields are:
1. `text` (str)

Your output fields are:
1. `reasoning` (str)
2. `label` (str)

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## text ## ]]
{text}

[[ ## reasoning ## ]]
{reasoning}

[[ ## label ## ]]
{label}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Classify sentiment of the given text. Answer with 1 for positive, 0 for negative, or 2 for neutral.


[31mUser message:[0m

[[ ## text ## ]]
it 's a charming and often affecting journey .

Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## label ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.


[31mResponse:[0m

[32m[[ ## reasoning ## ]]
The text describes the journey as "charming" and "often affecting," which are positive adjectives. "Charming" suggests that the journey is 

In [42]:
evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=10, return_outputs= True, return_all_scores=True)
results = evaluate(cot_sentiment)
items = []
for sample in results[1]:
    item = {}
    sentence = sample[0]['text']
    label = sample[0]['label']
    pred = sample[1]['label']
    item['text'] = sentence
    item['label'] = label
    item['pred'] = pred
    items.append(item)
df_result = pd.DataFrame(data = items)
df_result.to_csv('claude-3-5-sonnet-0shot-cot-sst2.csv')

Average Metric: 760 / 872  (87.2): 100%|██████████| 872/872 [05:35<00:00,  2.60it/s]


Unnamed: 0,text,example_label,reasoning,pred_label,eval_metric
0,it 's a charming and often affecting journey .,1,"The text describes the journey as ""charming"" and ""often affecting,"" which are positive adjectives. ""Charming"" suggests that the journey is delightful and pleasing, while ""affecting""...",1,✔️ [True]
1,unflinchingly bleak and desperate,0,"The text ""unflinchingly bleak and desperate"" uses words that convey a strong sense of negativity. ""Bleak"" suggests a lack of hope or positivity, and ""desperate""...",0,✔️ [True]
2,allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker .,1,"The text expresses optimism and hope regarding Nolan's future career, suggesting that he is on the verge of achieving significant success as a filmmaker who...",1,✔️ [True]
3,"the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales .",1,"The text praises multiple aspects of the production, such as acting, costumes, music, cinematography, and sound, describing them as ""astounding."" This indicates a strong positive...",1,✔️ [True]
4,"it 's slow -- very , very slow .",0,"The text describes something as ""slow"" and emphasizes this by repeating ""very, very slow."" This repetition suggests a negative sentiment, as slowness is typically not...",0,✔️ [True]
5,"although laced with humor and a few fanciful touches , the film is a refreshingly serious look at young women .",1,"The text describes the film as ""refreshingly serious"" and mentions it is ""laced with humor and a few fanciful touches,"" which suggests a positive sentiment....",1,✔️ [True]
6,a sometimes tedious film .,0,"The text describes the film as ""sometimes tedious,"" which suggests a negative sentiment. The word ""tedious"" implies that the film can be boring or monotonous...",0,✔️ [True]
7,or doing last year 's taxes with your ex-wife .,0,"The text refers to the task of doing taxes with an ex-wife, which is likely to be an uncomfortable or unpleasant situation due to the...",0,✔️ [True]
8,you do n't have to know about music to appreciate the film 's easygoing blend of comedy and romance .,1,"The text suggests that the film is enjoyable and accessible even to those who are not knowledgeable about music. The use of the words ""easygoing...",1,✔️ [True]
9,"in exactly 89 minutes , most of which passed as slowly as if i 'd been sitting naked on an igloo , formula 51 sank...",0,"The text describes the experience of watching ""formula 51"" as unpleasant and slow, using negative imagery such as ""sitting naked on an igloo"" and describing...",0,✔️ [True]


# Evaluate by modification

## Without label change

In [41]:
def evaluate_modified_set(ds, program):
    examples = [
    dspy.Example({ 
                  "text" : remove_space(r['modified_text']), 
                  "original_text": remove_space(r['original_text']),
                  "label": int(r['label']),
                  "modified_label": int(r['label'])
                }
                  ).with_inputs("text") 
    for r in ds
    ]
    evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=1, return_outputs= True, return_all_scores=True)
    results = evaluate(program)
    return results

In [42]:
class Sentiment(dspy.Signature):
    """Classify sentiment of the given text. Answer with 1 for positive, 0 for negative."""
    text = dspy.InputField()
    label = dspy.OutputField(prefix = 'Answer:')
class SimpleSentiment(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.Predict(Sentiment)

    def forward(self, text):
        return self.prog(text=text)
simple_sentiment = SimpleSentiment()

In [48]:

# Configure GPT-4 as the language model
original_pred_ds = pd.read_csv('results/sent/claude-3-5-sonnet-0shot-sst2.csv', index_col=False)
original_pred_ds['text'] = original_pred_ds['text'].apply(lambda x: remove_space(x).encode('utf-8').decode('unicode-escape'))

# Get all json files in the specified directory
json_files = glob.glob('../data/modified_data/sent/*_100.json')

for json_file in json_files:
    # Load the json file
    print(json_file)
    if 'grammatical_role' in json_file or 'negation' in json_file:
        continue
    with open(json_file, 'r') as f:
        data = json.load(f)
    
    results = evaluate_modified_set(data,simple_sentiment)

    
    # Convert results to dataframe
    items = []
    for sample in results[1]:
        item = {}
        sentence = sample[0]['text']
        label = sample[0]['label'] 
        pred = sample[1]['label']
        item['text'] = sentence
        item['modified_label'] = label
        pred = extract_prediction(pred)
        item['modified_pred'] = pred
        # original_text = sample[0]['original_text']
        original_text = sample[0]['original_text'].encode('utf-8').decode('unicode-escape')
        # original_text = r"{}".format(original_text)
        # print(original_text)
        item['original_label'] = sample[0]['label']
        item['original_text'] = original_text
        item['original_pred'] = original_pred_ds.loc[original_pred_ds['text'] == original_text]['pred'].values[0]
        items.append(item)
    
    df_result = pd.DataFrame(data=items)
    
    # Save results with filename based on input json
    output_filename = f"results/sent/claude-3-5-sonnet-0shot-{json_file.split('/')[-1].replace('.json', '')}.csv"
    df_result.to_csv(output_filename)


../data/modified_data/sent/casual_100.json


Average Metric: 93 / 100  (93.0): 100%|██████████| 100/100 [00:00<00:00, 2604.66it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,it shows that fincher is a director who skillfully uses tech skills to dig into what makes people tick.,it confirms fincher's status as a film maker who artfully bends technical know-how to the service of psychological insight.,1,1,1,✔️ [True]


it confirms fincher's status as a film maker who artfully bends technical know-how to the service of psychological insight.
brilliantly explores the conflict between following one's heart and following the demands of tradition.
two hours fly by -- opera's a pleasure when you don't have to endure intermissions -- and even a novice to the form comes away exhilarated.
filmmakers who can deftly change moods are treasures and even marvels.
his comedy premises are often hackneyed or just plain crude, calculated to provoke shocked laughter, without following up on a deeper level.
suffocated by its fussy script and uptight characters, this musty adaptation is all the more annoying since it's been packaged and sold back to us by hollywood.
no aspirations to social import inform the movie version.
once (kim) begins to overplay the shock tactics and bait-and-tackle metaphors, you may decide it's too high a price to pay for a shimmering picture postcard.
a gripping movie, played with performances 

Average Metric: 91 / 99  (91.9): 100%|██████████| 99/99 [00:00<00:00, 2827.37it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,"too often, the viewer isn't reacting to humor, rather they are wincing back in repugnance.","too often, the viewer isn't reacting to humor so much as they are wincing back in repugnance.",0,0,0,✔️ [True]


too often, the viewer isn't reacting to humor so much as they are wincing back in repugnance.
aside from minor tinkering, this is the same movie you probably loved in 1994, except that it looks even better.
this re-do is so dumb and so exploitative in its violence that, ironically, it becomes everything that the rather clumsy original was railing against.
moody, heartbreaking, and filmed in a natural, unforced style that makes its characters seem entirely convincing even when its script is not.
for the first time in years, de niro digs deep emotionally, perhaps because he's been stirred by the powerful work of his co-stars.
and that's a big part of why we go to the movies.
miller is playing so free with emotions, and the fact that children are hostages to fortune, that he makes the audience hostage to his swaggering affectation of seriousness.
vera's three actors -- mollà, gil and bardem -- excel in insightful, empathetic performances.
an interesting story with a pertinent (cinematical

Average Metric: 91 / 95  (95.8): 100%|██████████| 95/95 [00:00<00:00, 2828.30it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,"escaping the studio, piccoli is warmly affecting and so is this adroitly minimalist film-making.","escaping the studio, piccoli is warmly affecting and so is this adroitly minimalist movie.",1,1,1,✔️ [True]


escaping the studio, piccoli is warmly affecting and so is this adroitly minimalist movie.
a fast, funny, highly enjoyable movie.
drops you into a dizzying, volatile, pressure-cooker of a situation that quickly snowballs out of control, while focusing on the what much more than the why.
it feels like an after-school special gussied up with some fancy special effects, and watching its rote plot points connect is about as exciting as gazing at an egg timer for 93 minutes.
nothing in waking up in reno ever inspired me to think of its inhabitants as anything more than markers in a screenplay.
the far future may be awesome to consider, but from period detail to matters of the heart, this film is most transporting when it stays put in the past.
a compelling spanish film about the withering effects of jealousy in the life of a young monarch whose sexual passion for her husband becomes an obsession.
his comedy premises are often hackneyed or just plain crude, calculated to provoke shocked laug

Average Metric: 94 / 100  (94.0): 100%|██████████| 100/100 [00:00<00:00, 1596.35it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,i'll wager the video game is a lot more fun than the film.,i'll bet the video game is a lot more fun than the film.,0,0,1,


i'll bet the video game is a lot more fun than the film.
a painfully funny ode to bad behavior.
the far future may be awesome to consider, but from period detail to matters of the heart, this film is most transporting when it stays put in the past.
the experience of going to a film festival is a rewarding one; the experiencing of sampling one through this movie is not.
a misogynistic piece of filth that attempts to pass itself off as hip, young adult entertainment.
broomfield turns his distinctive ` blundering ' style into something that could really help clear up the case.
there are simply too many ideas floating around -- part farce, part sliding doors, part pop video -- and yet failing to exploit them.
nervous breakdowns are not entertaining.
deliriously funny, fast and loose, accessible to the uninitiated, and full of surprises
fun, flip and terribly hip bit of cinematic entertainment.
it deserves to be seen by anyone with even a passing interest in the events shaping the world bey

Average Metric: 96 / 100  (96.0): 100%|██████████| 100/100 [00:17<00:00,  5.76it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,"the far future may be awesome to consider, but from period detail to matters of the heart, this film is most transporting and captivating when...","the far future may be awesome to consider, but from period detail to matters of the heart, this film is most transporting when it stays...",1,1,1,✔️ [True]


the far future may be awesome to consider, but from period detail to matters of the heart, this film is most transporting when it stays put in the past.
the film's welcome breeziness and some unbelievably hilarious moments -- most portraying the idiocy of the film industry -- make it mostly worth the trip.
that's pure pr hype.
adults will wish the movie were less simplistic, obvious, clumsily plotted and shallowly characterized.
one of the more intelligent children's movies to hit theaters this year.
hilariously inept and ridiculous.
few films capture so perfectly the hopes and dreams of little boys on baseball fields as well as the grown men who sit in the stands.
this re-do is so dumb and so exploitative in its violence that, ironically, it becomes everything that the rather clumsy original was railing against.
a giggle a minute.
does little more than play an innocuous game of fill-in - the-blanks with a tragic past.
it's a stunning lyrical work of considerable force and truth.
too o

Average Metric: 96 / 99  (97.0): 100%|██████████| 99/99 [00:13<00:00,  7.40it/s] 


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,THIS movie is maddening.,this movie is maddening.,0,0,0,✔️ [True]


this movie is maddening.
this re-do is so dumb and so exploitative in its violence that, ironically, it becomes everything that the rather clumsy original was railing against.
no aspirations to social import inform the movie version.
teen movies have really hit the skids.
taylor appears to have blown his entire budget on soundtrack rights and had nothing left over for jokes.
just one bad idea after another.
too much of it feels unfocused and underdeveloped.
indifferently implausible popcorn programmer of a movie.
it has its moments of swaggering camaraderie, but more often just feels generic, derivative and done to death.
a valueless kiddie paean to pro basketball underwritten by the nba.
it takes a strange kind of laziness to waste the talents of robert forster, anne meara, eugene levy, and reginald veljohnson all in the same movie.
without non-stop techno or the existential overtones of a kieslowski morality tale, maelström is just another winter sleepers.
slapstick buffoonery can ti

Average Metric: 93 / 100  (93.0): 100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,"the reality of the new live-action pinocchio he has directed, cowritten and starred in is bordering on the grotesque.","the reality of the new live-action pinocchio he directed, cowrote and starred in borders on the grotesque.",0,0,0,✔️ [True]


the reality of the new live-action pinocchio he directed, cowrote and starred in borders on the grotesque.
a coda in every sense, the pinochet case splits time between a minute-by-minute account of the british court's extradition chess game and the regime's talking-head survivors.
just one bad idea after another.
birthday girl is an amusing joy ride, with some surprisingly violent moments.
there's a wickedly subversive bent to the best parts of birthday girl.
a smart, witty follow-up.
what really makes it special is that it pulls us into its world, gives us a hero whose suffering and triumphs we can share, surrounds him with interesting characters and sends us out of the theater feeling we've shared a great adventure.
while the ideas about techno-saturation are far from novel, they're presented with a wry dark humor.
brilliantly explores the conflict between following one's heart and following the demands of tradition.
taylor appears to have blown his entire budget on soundtrack rights

Average Metric: 91 / 100  (91.0): 100%|██████████| 100/100 [00:14<00:00,  6.97it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,"may be far from the best of the series, but it's assured, wonderfully respectful of its past and thrilling enough to make it abundantly clear...","may be far from the best of the series, but it's assured, wonderfully respectful of its past and thrilling enough to make it abundantly clear...",1,1,1,✔️ [True]


may be far from the best of the series, but it's assured, wonderfully respectful of its past and thrilling enough to make it abundantly clear that this movie phenomenon has once again reinvented itself for a new generation.
a compelling spanish film about the withering effects of jealousy in the life of a young monarch whose sexual passion for her husband becomes an obsession.
there's really only one good idea in this movie, but the director runs with it and presents it with an unforgettable visual panache.
detox is ultimately a pointless endeavor.
trite, banal, cliched, mostly inoffensive.
too slow, too long and too little happens.
whether writer-director anne fontaine's film is a ghost story, an account of a nervous breakdown, a trip down memory lane, all three or none of the above, it is as seductive as it is haunting.
too often, the viewer isn't reacting to humor so much as they are wincing back in repugnance.
one of the more intelligent children's movies to hit theaters this year.

Average Metric: 96 / 100  (96.0): 100%|██████████| 100/100 [00:13<00:00,  7.50it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,... a truly magnificent drama that is well worth tracking down.,... a magnificent drama well worth tracking down.,1,1,1,✔️ [True]


... a magnificent drama well worth tracking down.
the affectionate loopiness that once seemed congenital to demme's perspective has a tough time emerging from between the badly dated cutesy-pie mystery scenario and the newfangled hollywood post-production effects.
teen movies have really hit the skids.
the character of zigzag is not sufficiently developed to support a film constructed around him.
the most compelling wiseman epic of recent years.
another in-your-face wallow in the lower depths made by people who have never sung those blues.
two hours fly by -- opera's a pleasure when you don't have to endure intermissions -- and even a novice to the form comes away exhilarated.
it's a work by an artist so in control of both his medium and his message that he can improvise like a jazzman.
it takes a strange kind of laziness to waste the talents of robert forster, anne meara, eugene levy, and reginald veljohnson all in the same movie.
villeneuve spends too much time wallowing in bibi's ge

Average Metric: 96 / 100  (96.0): 100%|██████████| 100/100 [00:21<00:00,  4.74it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,or doing last year's paperwork with your ex-wife.,or doing last year's taxes with your ex-wife.,0,0,0,✔️ [True]


or doing last year's taxes with your ex-wife.
it's an offbeat treat that pokes fun at the democratic exercise while also examining its significance for those who take part.
partway through watching this saccharine, easter-egg-colored concoction, you realize that it is made up of three episodes of a rejected tv show.
villeneuve spends too much time wallowing in bibi's generic angst (there are a lot of shots of her gazing out windows).
it haunts, horrifies, startles and fascinates; it is impossible to look away.
the acting, costumes, music, cinematography and sound are all astounding given the production's austere locales.
a misogynistic piece of filth that attempts to pass itself off as hip, young adult entertainment.
lapaglia's ability to convey grief and hope works with weaver's sensitive reactions to make this a two-actor master class.
a painfully funny ode to bad behavior.
it's not the ultimate depression-era gangster movie.
the film makes a fatal mistake: it asks us to care about a

Average Metric: 94 / 100  (94.0): 100%|██████████| 100/100 [00:13<00:00,  7.53it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,moretti's compellling anatomy of grief and the difficult process of adapting to loss.,moretti's compelling anatomy of grief and the difficult process of adapting to loss.,0,0,1,


moretti's compelling anatomy of grief and the difficult process of adapting to loss.
it's of the quality of a lesser harrison ford movie - six days, seven nights, maybe, or that dreadful sabrina remake.
this piece of channel 5 grade trash is, quite frankly, an insult to the intelligence of the true genre enthusiast.
a bloated gasbag thesis grotesquely impressed by its own gargantuan aura of self-importance...
outer-space buffs might love this film, but others will find its pleasures intermittent.
late marriage's stiffness is unlikely to demonstrate the emotional clout to sweep u.s. viewers off their feet.
the cold turkey would've been a far better title.
this movie is maddening.
not really bad so much as distasteful: we need kidnapping suspense dramas right now like we need doomsday thrillers.
teen movies have really hit the skids.
with virtually no interesting elements for an audience to focus on, chelsea walls is a triple-espresso endurance challenge.
there are simply too many ideas 

Average Metric: 91 / 100  (91.0): 100%|██████████| 100/100 [00:22<00:00,  4.40it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,the affectionate quirkiness that once seemed inherent to Seewoosagur's viewpoint struggles to shine through amidst the outdated charming mystery plot and the modern Bollywood-style post-production...,the affectionate loopiness that once seemed congenital to demme's perspective has a tough time emerging from between the badly dated cutesy-pie mystery scenario and the...,0,0,0,✔️ [True]


the affectionate loopiness that once seemed congenital to demme's perspective has a tough time emerging from between the badly dated cutesy-pie mystery scenario and the newfangled hollywood post-production effects.
no sophomore slump for director sam mendes, who segues from oscar winner to oscar-winning potential with a smooth sleight of hand.
a movie with a real anarchic flair.
a compelling spanish film about the withering effects of jealousy in the life of a young monarch whose sexual passion for her husband becomes an obsession.
people cinema at its finest.
like you couldn't smell this turkey rotting from miles away.
teen movies have really hit the skids.
what the director can't do is make either of val kilmer's two personas interesting or worth caring about.
sustains its dreamlike glide through a succession of cheesy coincidences and voluptuous cheap effects, not the least of which is rebecca romijn-stamos.
that's pure pr hype.
this is a good script, good dialogue, funny even for a

Average Metric: 96 / 100  (96.0): 100%|██████████| 100/100 [00:13<00:00,  7.27it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,"just one bad idea, after another.",just one bad idea after another.,0,0,0,✔️ [True]


just one bad idea after another.
a broad, melodramatic estrogen opera that's pretty toxic in its own right.
villeneuve spends too much time wallowing in bibi's generic angst (there are a lot of shots of her gazing out windows).
teen movies have really hit the skids.
(t) here's only so much anyone can do with a florid, overplotted, anne rice rock 'n' roll vampire novel before the built-in silliness of the whole affair defeats them.
nelson's brutally unsentimental approach... sucks the humanity from the film, leaving behind an horrific but weirdly unemotional spectacle.
not really bad so much as distasteful: we need kidnapping suspense dramas right now like we need doomsday thrillers.
another in-your-face wallow in the lower depths made by people who have never sung those blues.
no one but a convict guilty of some truly heinous crime should have to sit through the master of disguise.
not exactly the bees knees
corpus collosum -- while undeniably interesting -- wore out its welcome well b

Average Metric: 93 / 95  (97.9): 100%|██████████| 95/95 [00:18<00:00,  5.10it/s] 


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,"(t) here's only so much anyone can do with a showy, overplotted, anne rice rock'n' roll vampire novel before the built-in silliness of the whole...","(t) here's only so much anyone can do with a florid, overplotted, anne rice rock 'n' roll vampire novel before the built-in silliness of the...",0,0,0,✔️ [True]


(t) here's only so much anyone can do with a florid, overplotted, anne rice rock 'n' roll vampire novel before the built-in silliness of the whole affair defeats them.
the film tunes into a grief that could lead a man across centuries.
it's just filler.
it feels like an after-school special gussied up with some fancy special effects, and watching its rote plot points connect is about as exciting as gazing at an egg timer for 93 minutes.
cq's reflection of artists and the love of cinema-and-self suggests nothing less than a new voice that deserves to be considered as a possible successor to the best european directors.
no screen fantasy-adventure in recent memory has the showmanship of clones ' last 45 minutes.
there is very little dread or apprehension, and though i like the creepy ideas, they are not executed with anything more than perfunctory skill.
that is a compliment to kuras and miller.
a gripping movie, played with performances that are all understated and touching.
there's... 

Average Metric: 94 / 100  (94.0): 100%|██████████| 100/100 [00:15<00:00,  6.59it/s]


Unnamed: 0,text,original_text,example_label,modified_label,pred_label,eval_metric
0,"As the two leads, Lathan and Diggs are seen as charming and are perceived to have chemistry both as friends and lovers.","as the two leads, lathan and diggs are charming and have chemistry both as friends and lovers.",1,1,1,✔️ [True]


as the two leads, lathan and diggs are charming and have chemistry both as friends and lovers.
an interesting story with a pertinent (cinematically unique) message, told fairly well and scored to perfection, i found myself struggling to put my finger on that elusive `` missing thing. ''
a literate presentation that wonderfully weaves a murderous event in 1873 with murderous rage in 2002.
at least one scene is so disgusting that viewers may be hard pressed to retain their lunch.
that is a compliment to kuras and miller.
the character of zigzag is not sufficiently developed to support a film constructed around him.
too much of it feels unfocused and underdeveloped.
thanks to scott's charismatic roger and eisenberg's sweet nephew, roger dodger is one of the most compelling variations on in the company of men.
it's of the quality of a lesser harrison ford movie - six days, seven nights, maybe, or that dreadful sabrina remake.
a sequence of ridiculous shoot - 'em - up scenes.
this illuminat

## With label change

In [27]:
def evaluate_modified_set(ds, program):
    examples = [
    dspy.Example({ 
                  "text" : remove_space(r['modified_text']), 
                  "original_text": remove_space(r['original_text']),
                  "label": int(r['modified_label']),
                  "original_label": int(r['label']),
                  "index": r['index']
                }
                  ).with_inputs("text") 
    for r in ds
    ]
    evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=1, return_outputs= True, return_all_scores=True)
    results = evaluate(program)
    return results

In [28]:
class Sentiment(dspy.Signature):
    """Classify sentiment of the given text. Answer with 1 for positive, 0 for negative."""
    text = dspy.InputField()
    label = dspy.OutputField(prefix = 'Answer:')
class SimpleSentiment(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.Predict(Sentiment)

    def forward(self, text):
        return self.prog(text=text)
simple_sentiment = SimpleSentiment()

In [30]:
original_pred_ds = pd.read_csv('results/sent/claude-3-5-sonnet-0shot-sst2.csv', index_col=False)
original_pred_ds['text'] = original_pred_ds['text'].apply(lambda x: remove_space(x).encode('utf-8').decode('unicode-escape'))

# Get all json files in the specified directory
json_files = glob.glob('../data/modified_data/sent/*_100.json')

for json_file in json_files:
    # Load the json file
    print(json_file)
    if not any(x in json_file for x in ['grammatical_role', 'negation']):
        continue
    with open(json_file, 'r') as f:
        data = json.load(f)
    
    results = evaluate_modified_set(data,simple_sentiment)

    
    # Convert results to dataframe
    items = []
    for sample in results[1]:
        item = {}
        sentence = sample[0]['text']
        pred = sample[1]['label']
        item['text'] = sentence
        item['modified_label'] = sample[0]['label'] 
        pred = extract_prediction(pred)
        item['modified_pred'] = pred
        # original_text = sample[0]['original_text']
        # print(original_text)
        original_text = sample[0]['original_text'].encode('utf-8').decode('unicode-escape')

        item['original_label'] = sample[0]['original_label']
        item['original_text'] = original_text
        item['original_pred'] = original_pred_ds.iloc[sample[0]['index']]['pred']
        items.append(item)
    
    df_result = pd.DataFrame(data=items)
    
    
    # Save results with filename based on input json
    output_filename = f"results/sent/claude-3-5-sonnet-0shot-{json_file.split('/')[-1].replace('.json', '')}.csv"
    df_result.to_csv(output_filename)


../data/modified_data/sent/casual_100.json
../data/modified_data/sent/discourse_100.json
../data/modified_data/sent/compound_word_100.json
../data/modified_data/sent/temporal_bias_100.json
../data/modified_data/sent/coordinating_conjunction_100.json
../data/modified_data/sent/capitalization_100.json
../data/modified_data/sent/dialectal_100.json
../data/modified_data/sent/sentiment_100.json
../data/modified_data/sent/grammatical_role_100.json


Average Metric: 61 / 66  (92.4): 100%|██████████| 66/66 [00:00<00:00, 2722.56it/s]


Unnamed: 0,text,original_text,example_label,original_label,index,pred_label,eval_metric
0,"thanks to the film's mood's absolute control by haynes, and buoyed by three terrific performances, far from heaven actually pulls off this stylistic juggling act.","thanks to haynes ' absolute control of the film's mood, and buoyed by three terrific performances, far from heaven actually pulls off this stylistic juggling...",1,1,473,1,✔️ [True]


../data/modified_data/sent/length_bias_100.json
../data/modified_data/sent/concept_replacement_100.json
../data/modified_data/sent/typo_bias_100.json
../data/modified_data/sent/geographical_bias_100.json
../data/modified_data/sent/punctuation_100.json
../data/modified_data/sent/derivation_100.json
../data/modified_data/sent/active_to_passive_100.json
../data/modified_data/sent/negation_100.json


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:45<00:00,  2.19it/s]


Unnamed: 0,text,original_text,example_label,original_label,index,pred_label,eval_metric
0,What seldom distinguishes Time of Favor from countless other thrillers is its underlying concern with the consequences of words and with the complicated emotions fueling...,what distinguishes time of favor from countless other thrillers is its underlying concern with the consequences of words and with the complicated emotions fueling terrorist...,1,1,410,1,✔️ [True]


# Aggregate results

In [4]:
from scipy import stats

In [31]:
result_files = glob.glob('results/sent/claude-3-5-sonnet-0shot-*_100.csv')

aggregated_results = []

for file in result_files:
    # Extract modification type from filename
    print(file)
    mod_type = file.split('-')[-1].replace('.csv','')
    
    # Read results file
    df = pd.read_csv(file)

    # Calculate accuracies
    original_correct = (df['original_pred'] == df['original_label']).sum()
    modified_correct = (df['modified_pred'] == df['modified_label']).sum()
    total = len(df)

    original_acc = original_correct / total
    modified_acc = modified_correct / total
    
    # Calculate the difference between original_res and modified_res
    difference = -round(original_acc - modified_acc, 2)
    
    # Calculate percentage difference with respect to total samples
    pct_difference = -round((original_correct - modified_correct) / original_correct * 100, 2)
    
    # Perform t-test between original and modified predictions
    t_stat, p_value = stats.ttest_ind(
        (df['original_pred'] == df['original_label']).astype(float),
        (df['modified_pred'] == df['modified_label']).astype(float)
    )
    
    aggregated_results.append({
        'task': 'dialogue_contradiction_detection',
        'modification': mod_type,
        'original_res': round(original_acc, 2),
        'modified_res': round(modified_acc, 2),
        'difference': difference,  # Difference in accuracy
        'pct_difference': pct_difference,  # Percentage difference relative to total samples
        'p_value': p_value  # Add p-value from t-test
    })

# Create final results dataframe
results_df = pd.DataFrame(aggregated_results)

# Sort the results based on modification_name
modification_name = ['temporal_bias_100', 'geographical_bias_100','length_bias_100', 'typo_bias_100', 'capitalization_100', 'punctuation_100', 'derivation_100', 'compound_word_100','active_to_passive_100','grammatical_role_100', 'coordinating_conjunction_100', 'concept_replacement_100','negation_100','discourse_100','sentiment_100','casual_100', 'dialectal_100']
results_df['modification'] = pd.Categorical(results_df['modification'], categories=modification_name, ordered=True)
results_df = results_df.sort_values(by='modification')

# Calculate averages across all modifications
avg_original = results_df['original_res'].mean()
avg_modified = results_df['modified_res'].mean()
avg_difference = avg_original - avg_modified
avg_pct_difference = results_df['pct_difference'].mean()

# Add averages as a new row
results_df.loc[len(results_df)] = {
    'task': 'dialogue_contradiction_detection',
    'modification': 'average',
    'original_res': round(avg_original, 2),
    'modified_res': round(avg_modified, 2),
    'difference': -round(avg_difference, 2),
    'pct_difference': round(avg_pct_difference, 2),
    'p_value': None  # No p-value for average row
}

print("\n")
results_df.to_csv('results/sent/claude-3-5-sonnet-DP.csv')

# Apply styling to highlight rows where original_res > modified_res and significant p-values
def highlight_drops_and_significance(row):
    colors = [''] * len(row)
    if row['original_res'] > row['modified_res']:
        colors = ['background-color: red'] * len(row)
        # If p-value < 0.05, add bold text
        if 'p_value' in row and row['p_value'] is not None and row['p_value'] < 0.05:
            colors = ['background-color: red; font-weight: bold'] * len(row)
    return colors

results_df.round(2).style.apply(highlight_drops_and_significance, axis=1)


results/sent/claude-3-5-sonnet-0shot-geographical_bias_100.csv
results/sent/claude-3-5-sonnet-0shot-discourse_100.csv
results/sent/claude-3-5-sonnet-0shot-concept_replacement_100.csv
results/sent/claude-3-5-sonnet-0shot-active_to_passive_100.csv
results/sent/claude-3-5-sonnet-0shot-capitalization_100.csv
results/sent/claude-3-5-sonnet-0shot-negation_100.csv
results/sent/claude-3-5-sonnet-0shot-grammatical_role_100.csv
results/sent/claude-3-5-sonnet-0shot-punctuation_100.csv
results/sent/claude-3-5-sonnet-0shot-length_bias_100.csv
results/sent/claude-3-5-sonnet-0shot-temporal_bias_100.csv
results/sent/claude-3-5-sonnet-0shot-typo_bias_100.csv
results/sent/claude-3-5-sonnet-0shot-dialectal_100.csv
results/sent/claude-3-5-sonnet-0shot-sentiment_100.csv
results/sent/claude-3-5-sonnet-0shot-coordinating_conjunction_100.csv
results/sent/claude-3-5-sonnet-0shot-compound_word_100.csv
results/sent/claude-3-5-sonnet-0shot-casual_100.csv
results/sent/claude-3-5-sonnet-0shot-derivation_100.csv




  results_df.loc[len(results_df)] = {


Unnamed: 0,task,modification,original_res,modified_res,difference,pct_difference,p_value
9,dialogue_contradiction_detection,temporal_bias_100,0.9,0.94,0.04,4.44,0.3
0,dialogue_contradiction_detection,geographical_bias_100,0.9,0.92,0.02,2.22,0.62
8,dialogue_contradiction_detection,length_bias_100,0.91,0.94,0.03,3.3,0.42
10,dialogue_contradiction_detection,typo_bias_100,0.91,0.95,0.04,4.4,0.27
4,dialogue_contradiction_detection,capitalization_100,0.91,0.96,0.05,5.56,0.15
7,dialogue_contradiction_detection,punctuation_100,0.92,0.93,0.01,1.09,0.79
16,dialogue_contradiction_detection,derivation_100,0.91,0.95,0.04,4.65,0.27
14,dialogue_contradiction_detection,compound_word_100,0.94,0.95,0.01,1.12,0.76
3,dialogue_contradiction_detection,active_to_passive_100,0.9,0.88,-0.02,-2.22,0.65
6,dialogue_contradiction_detection,grammatical_role_100,0.94,0.92,-0.02,-1.61,0.73


In [205]:
# Load results from different models
gpt4_df = pd.read_csv('results/sent/claude-3-5-sonnet-0shot-sst2.csv')
claude_df = pd.read_csv('results/sent/claude-3-5-sonnet-0shot-sst2.csv')
mixtral_df = pd.read_csv('results/sent/mixtral-8x22b-sst2.csv')

# Calculate accuracy between predictions and labels
gpt4_acc = (gpt4_df['pred'] == gpt4_df['label']).mean()
claude_acc = (claude_df['pred'] == claude_df['label']).mean()
mixtral_acc = (mixtral_df['pred'] == mixtral_df['label']).mean()
# Calculate average accuracy for each model
print(f"GPT-4 Average Accuracy: {gpt4_acc:.2%}")
print(f"Claude-3.5 Average Accuracy: {claude_acc:.2%}")
print(f"Mixtral Average Accuracy: {mixtral_acc:.2%}")

# Create comparison dataframe
comparison_df = pd.DataFrame({
    'Model': ['GPT-4', 'Claude-3.5', 'Mixtral'],
    'Accuracy': [gpt4_acc, claude_acc, mixtral_acc]
})

# Style the dataframe
def highlight_max(s):
    is_max = s == s.max()
    return ['background-color: green' if v else '' for v in is_max]

styled_df = comparison_df.style.apply(highlight_max, subset=['Accuracy'])
styled_df


GPT-4 Average Accuracy: 93.69%
Claude-3.5 Average Accuracy: 91.51%
Mixtral Average Accuracy: 83.37%


Unnamed: 0,Model,Accuracy
0,GPT-4,0.936927
1,Claude-3.5,0.915138
2,Mixtral,0.833716
