In [1]:
import pandas as pd
import numpy as np

df_m = pd.read_csv('data/mitigations_misinfo.csv')
df_h = pd.read_csv('data/mitigations_hate.csv')

df_m['cot_full_3'] = df_m['label_gpt3.5_prompt_4']
df_m['cot_full_4'] = df_m['label_gpt4_prompt_4']

df_h['cot_full_3'] = df_h['label_gpt3.5_prompt_4']
df_h['cot_full_4'] = df_h['label_gpt4_prompt_4']

cols = ['label_gpt4_prompt_1', 'label_gpt4_prompt_2','label_gpt4_prompt_3',
        'label_gpt3.5_prompt_1', 'label_gpt3.5_prompt_2', 'label_gpt3.5_prompt_3']

cot_prompts = ['label_gpt3.5_prompt_4','label_gpt4_prompt_4']

#### Process misinfo outputs

In [2]:
def process_labels(x):
    t = str(x).lower().strip('"').strip('[').strip(' ').strip('\n').strip(' ')
    if t=='misinformation':
        return 1
    elif t=='not misinformation':
        return 0
    elif ('not' in t) or ('mis' in t):
        return 0
    else:
        return np.nan

In [3]:
for col in cols:
    df_m[col] = df_m[col].apply(lambda x: process_labels(x))  
df_m = df_m.dropna(subset = cols)

In [4]:
def process_cot(x):
    x = str(x).split(':')[-1].strip('.').strip(' ')
    negative_label = 'not misinformation'
    if x=='misinformation':
        return 1
    elif x=='not misinformation' or x[-len(negative_label):] == negative_label:
        return 0
    elif x=='nan':
        return np.nan
    else:
        return 0

for cot_prompt in cot_prompts:
    df_m[cot_prompt] = df_m[cot_prompt].apply(lambda x: process_cot(x))
df_m = df_m.dropna(subset = cot_prompts)

#### Process hate outputs

In [5]:
def process_labels(x):
    t = str(x).lower().strip('"').strip(' ').strip(' ').strip('[').strip('(').strip('\n').strip(' ').strip('.')
    if t=='hateful' or t=='h' or t=='hate':
        return 1
    elif t=='not hateful':
        return 0
    elif 'not' in t:
        return 0
    else:
        return np.nan

for col in cols:
    df_h[col] = df_h[col].apply(lambda x: process_labels(x))  
df_h = df_h.dropna(subset = cols)

In [6]:
def process_cot(x):
    
    x = str(x).lower().split(':')[-1].strip('.').strip(' ')
    negative_label = 'not hateful'
    if x=='hateful':
        return 1
    elif x=='not hateful' or x[-len(negative_label):] == negative_label:
        return 0
    elif x=='nan':
        return np.nan
    else:
        return 0

for cot_prompt in cot_prompts:
    df_h[cot_prompt] = df_h[cot_prompt].apply(lambda x: process_cot(x))
df_h = df_h.dropna(subset = cot_prompts)

#### Analyses misinfo

In [7]:
list_entries = []
df_t = df_m.loc[df_m['label_gpt3.5']==1]
print(len(df_t))
for col in ['label_gpt3.5_prompt_1', 'label_gpt3.5_prompt_3','label_gpt3.5_prompt_4']:
    entry = {}
    entry['cond'] = col
    entry['val'] = -round((100 - df_t[col].mean()*100),2)
    entry['model'] = 'gpt-3.5'
    entry['task'] = 'misinfo'
    print(round((100 - df_t[col].mean()*100),2))
    list_entries.append(entry)

293
58.02
46.76
73.04


In [8]:
df_t = df_m.loc[df_m['label_gpt4']==1]
print(len(df_t))
for col in ['label_gpt4_prompt_1', 'label_gpt4_prompt_3','label_gpt4_prompt_4']:
    entry = {}
    entry['cond'] = col
    entry['val'] = -round((100 - df_t[col].mean()*100),2)
    entry['model'] = 'gpt-4'
    entry['task'] = 'misinfo'
    print(round((100 - df_t[col].mean()*100),2))
    list_entries.append(entry)

149
48.32
28.19
59.06


In [9]:
df_t = df_m.loc[df_m['label_gpt4']==1]
print(len(df_t))
for col in ['label_gpt4_prompt_1', 'label_gpt4_prompt_3','label_gpt4_prompt_4']:
    entry = {}
    entry['cond'] = col
    entry['val'] = -round((100 - df_t[col].mean()*100),2)
    entry['model'] = 'gpt-4'
    entry['task'] = 'misinfo'
    print(0.1021*round((df_t[col].mean()*100),2))
    list_entries.append(entry)

149
5.276528
7.331801
4.179974


#### Analyses hate

In [10]:
df_t = df_h.loc[df_h['label_gpt3.5']==1]
print(len(df_t))
for col in ['label_gpt3.5_prompt_1', 'label_gpt3.5_prompt_3','label_gpt3.5_prompt_4']:
    entry = {}
    entry['cond'] = col
    entry['val'] = -round((100 - df_t[col].mean()*100),2)
    entry['model'] = 'gpt-3.5'
    entry['task'] = 'hate'
    print(round((100 - df_t[col].mean()*100),2))
    list_entries.append(entry)

60
41.67
41.67
78.33


In [11]:
df_t = df_h.loc[df_h['label_gpt4']==1]
print(len(df_t))
for col in ['label_gpt4_prompt_1', 'label_gpt4_prompt_3','label_gpt4_prompt_4']:
    entry = {}
    entry['cond'] = col
    entry['val'] = -round((100 - df_t[col].mean()*100),2)
    entry['model'] = 'gpt-4'
    entry['task'] = 'hate'
    print(round((100 - df_t[col].mean()*100),2))
    list_entries.append(entry)

46
43.48
39.13
82.61


In [12]:
df_t = df_m.loc[df_m['label_gpt4']==1]
print(len(df_t))
for col in ['label_gpt4_prompt_1', 'label_gpt4_prompt_3','label_gpt4_prompt_4']:
    entry = {}
    entry['cond'] = col
    entry['val'] = -round((100 - df_t[col].mean()*100),2)
    entry['model'] = 'gpt-4'
    entry['task'] = 'hate'
    print(0.0889*round((df_t[col].mean()*100),2))
    list_entries.append(entry)

149
4.594352000000001
6.383909000000001
3.639566


### Table 8 statistics

In [13]:
table = pd.DataFrame(list_entries)[['task','model', 'cond','val']].replace({'label_gpt3.5_prompt_1':'few shot use-mention examples',
                                                                    'label_gpt4_prompt_1':'few shot use-mention examples',
                                                                   'label_gpt4_prompt_3': 'use mention mitigation',
                                                                   'label_gpt3.5_prompt_3': 'use mention mitigation',
                                                                   'label_gpt3.5_prompt_4': 'few shot CoT use mention mitigation',
                                                                   'label_gpt4_prompt_4': 'few shot CoT use mention mitigation'})

In [14]:
(table.loc[table.model=='gpt-4'].drop_duplicates(subset=['task','model','cond']).sort_values(by = 'task').replace({'gpt-3.5': 'gpt-3.5-turbo (ChatGPT 3.5)'}))

Unnamed: 0,task,model,cond,val
12,hate,gpt-4,few shot use-mention examples,-43.48
13,hate,gpt-4,use mention mitigation,-39.13
14,hate,gpt-4,few shot CoT use mention mitigation,-82.61
3,misinfo,gpt-4,few shot use-mention examples,-48.32
4,misinfo,gpt-4,use mention mitigation,-28.19
5,misinfo,gpt-4,few shot CoT use mention mitigation,-59.06


### Table 9 statistics

In [15]:
(table.loc[table.model!='gpt-4'].drop_duplicates(subset=['task','model','cond']).sort_values(by = 'task').replace({'gpt-3.5': 'gpt-3.5-turbo (ChatGPT 3.5)'}))

Unnamed: 0,task,model,cond,val
9,hate,gpt-3.5-turbo (ChatGPT 3.5),few shot use-mention examples,-41.67
10,hate,gpt-3.5-turbo (ChatGPT 3.5),use mention mitigation,-41.67
11,hate,gpt-3.5-turbo (ChatGPT 3.5),few shot CoT use mention mitigation,-78.33
0,misinfo,gpt-3.5-turbo (ChatGPT 3.5),few shot use-mention examples,-58.02
1,misinfo,gpt-3.5-turbo (ChatGPT 3.5),use mention mitigation,-46.76
2,misinfo,gpt-3.5-turbo (ChatGPT 3.5),few shot CoT use mention mitigation,-73.04
