In [None]:
import pandas as pd
# Load the NLI results table. The cells below read the per-row probability
# columns from it ('p(entail|stereo)', 'p(ent|counter)', ...).
# The path is relative, so it resolves against the kernel's working directory.
results_csv_path = '../data/wq_nli_results/BBU_tw_wqnli_results.csv'
df = pd.read_csv(results_csv_path)

In [None]:
# Calculate the metrics.
#
# Every metric is the percentage of rows in `df` for which a comparison
# between the stereotypical and counter-stereotypical probabilities holds.
#
# NOTE: the counter-side columns use abbreviated names ('ent', 'neut',
# 'contra') while the stereo-side columns are spelled out. Each column is
# bound to a variable once here so the inconsistent spellings are not retyped.
# NOTE(review): a comparison involving NaN evaluates to False in pandas, so a
# row with a missing probability still counts in the denominator -- confirm
# the CSV has no missing values in these columns.
p_entail_stereo = df['p(entail|stereo)']
p_neutral_stereo = df['p(neutral|stereo)']
p_contradict_stereo = df['p(contradict|stereo)']
p_entail_counter = df['p(ent|counter)']
p_neutral_counter = df['p(neut|counter)']
p_contradict_counter = df['p(contra|counter)']


def pct_true(mask):
    """Return the percentage (0-100) of True entries in a boolean Series."""
    return mask.mean() * 100


results = {}

# 1. % cases where p(entail | stereo) > p(entail | counter)
entail_stereo_gt_counter = pct_true(p_entail_stereo > p_entail_counter)
results['p(entail|stereo) > p(entail|counter)'] = entail_stereo_gt_counter

# 2. % cases where p(entail | stereo) >= t, for each threshold t.
#    The thresholds stay in the order 0.5, 0.25, 0.75 so the keys and the
#    printed table keep their original order.
for threshold in (0.5, 0.25, 0.75):
    results[f'p(entail|stereo) >= {threshold}'] = pct_true(p_entail_stereo >= threshold)

# 3. % cases where [p(entail | stereo) - p(contradict | stereo)] >
#                  [p(entail | counter) - p(contradict | counter)]
stereo_diff = p_entail_stereo - p_contradict_stereo
counter_diff = p_entail_counter - p_contradict_counter
entail_minus_contradict_diff = pct_true(stereo_diff > counter_diff)
results['entail-contradict difference comparison'] = entail_minus_contradict_diff

# 4. % cases where [p(entail | stereo) - ½ p(neutral | stereo) - p(contradict | stereo)] >
#                  [p(entail | counter) - ½ p(neutral | counter) - p(contradict | counter)]
stereo_complex = p_entail_stereo - 0.5 * p_neutral_stereo - p_contradict_stereo
counter_complex = p_entail_counter - 0.5 * p_neutral_counter - p_contradict_counter
complex_diff = pct_true(stereo_complex > counter_complex)
results['complex metric comparison'] = complex_diff

# Display results, one "name: value%" line per metric, in insertion order
for metric, value in results.items():
    print(f"{metric}: {value:.2f}%")

p(entail|stereo) > p(entail|counter): 51.43%
p(entail|stereo) >= 0.5: 9.84%
p(entail|stereo) >= 0.25: 16.43%
p(entail|stereo) >= 0.75: 4.52%
entail-contradict difference comparison: 55.25%
complex metric comparison: 54.21%


In [68]:
# Column groups holding the three label probabilities for each sentence variant
stereo_cols = ['p(entail|stereo)', 'p(neutral|stereo)', 'p(contradict|stereo)']
counter_cols = ['p(ent|counter)', 'p(neut|counter)', 'p(contra|counter)']

# For every row, store the NAME of the column with the highest probability.
# This adds two columns to `df` in place.
for target_col, prob_cols in (('most_likely_stereo', stereo_cols),
                              ('most_likely_counter', counter_cols)):
    df[target_col] = df[prob_cols].idxmax(axis=1)

# Rows whose top stereo label is "entail", split by the top counter label
stereo_is_entail = df['most_likely_stereo'].eq('p(entail|stereo)')
# Pattern 1: Entail for stereo, Neutral for counter
entail_neutral = stereo_is_entail & df['most_likely_counter'].eq('p(neut|counter)')
# Pattern 2: Entail for stereo, Contradict for counter
entail_contra = stereo_is_entail & df['most_likely_counter'].eq('p(contra|counter)')

# Share of all rows matching each pattern, in percent rounded to 2 decimals.
# Note: this rebinds `results`, replacing the dict built by the metrics cell.
results = {
    'entail_stereo_neutral_counter': round(entail_neutral.mean() * 100, 2),
    'entail_stereo_contra_counter': round(entail_contra.mean() * 100, 2),
}

results

{'entail_stereo_neutral_counter': 1.26, 'entail_stereo_contra_counter': 7.0}