# Compute consistency scores of TruthfulQA

**Consistency Scores**
All of these are computed for both greedy and sampled answers.

* Compute consistency of questions
* Compute answer consistency over all questions, per model ('facebook_opt-125m', 'facebook_opt-350m', 'facebook_opt-1.3b', 'facebook_opt-2.7b'), together with the average answer accuracy
* Compute answer consistency per model on the human-filtered questions only
* Compute consistency on accurately answered questions only (human-filtered vs. not human-filtered)

**Correlation scores**
* Compute correlation of answer consistency scores with average answer accuracy
* Compute correlation of question consistency scores with average answer accuracy
* Compute correlation of question consistency with answer consistency

**Additional**
* Analysis of consistent and inconsistent questions
* Analysis of human filtered (what was removed and how much)

```py
scale_options = ['facebook_opt-125m', 'facebook_opt-350m', 'facebook_opt-1.3b', 'facebook_opt-2.7b']
decoder_options = ['greedy_answers', 'sampled_answers']
human_filtered = ['true', 'not_true']
```

In [1]:
import pandas as pd
import sys
sys.path.append("..") 
from utils import ConsistencyScoring
from tqdm.auto import tqdm

In [2]:
# Build the scorer once; the consistency cells further down all reuse this instance.
# NOTE(review): construction presumably loads a tokenizer/model (it prints the
# special-token warnings shown in the cell output) — confirm in utils.
consistency_scorer = ConsistencyScoring()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Consistency of questions together and by paraphrase type

**ToDo**: Run this on the human-filtered questions and on the questions that were actually answered

In [3]:
# Load the paraphrase table (one row per paraphrased question).
questions = pd.read_csv('../data/truthful_qa_top_6_by_pp.csv')
# Bare expression: the notebook displays the row count as the cell output.
len(questions)

4493

In [4]:
# TODO add human filtered analysis here!
# The manually filtered paraphrases live in two files. Only split 2 is
# filtered on its `keep` column here: rows whose value equals False are dropped.
split_one = pd.read_csv('../data/truthful_qa_top_6_by_pp_split_1_manually_filtered.csv')
split_two = pd.read_csv('../data/truthful_qa_top_6_by_pp_split_2_manually_filtered.txt')
split_two = split_two[split_two['keep'].ne(False)]
# Stack split 1 on top of the filtered split 2 and display the row count.
human_filtered = pd.concat([split_one, split_two])
len(human_filtered)

3706

In [5]:
# Treat every original question as a paraphrase of itself so that it is asked
# as well: the row is tagged with the pseudo-model 'original' and pp_score 1.0.
# NOTE: a set has no guaranteed iteration order, so the row order of this
# frame may differ between runs.
unique_originals = set(questions['original question'])
original_questions = pd.DataFrame(
    [
        {
            'dataset': 'truthful_qa',
            'passage': None,
            'original question': text,
            'paraphrased question': text,
            'paraphrased generaton model': 'original',
            'pp_score': 1.0,
        }
        for text in unique_originals
    ]
)
len(original_questions)

805

In [6]:
# Full pool: the original questions followed by all paraphrases.
df = pd.concat([original_questions, questions])
# Human-filtered pool: the manually kept paraphrases followed by the original questions.
# NOTE(review): the operand order is reversed relative to `df`, and neither
# concat passes ignore_index, so index labels repeat — confirm nothing
# downstream relies on row order or on unique index labels.
human_filtered_df = pd.concat([human_filtered, original_questions])

In [7]:
# Display the distinct paraphrase sources in the combined frame.
# The misspelled column name ("generaton") is the actual key used by the data, so it must stay as is.
df['paraphrased generaton model'].unique()

array(['original', 'T5-finetuned', 'Prompt-text-davinci-002', 'QC'],
      dtype=object)

In [8]:
# Number of rows per paraphrase source, counted on the non-null `dataset` column.
df.groupby('paraphrased generaton model')['dataset'].count()

paraphrased generaton model
Prompt-text-davinci-002     392
QC                         1410
T5-finetuned               2691
original                    805
Name: dataset, dtype: int64

In [9]:
# question_consistency = {}
# for q in tqdm(df['original question'].unique()):
#     questions = df[df['original question'] == q]['paraphrased question']
#     question_consistency[q] = consistency_scorer.get_score(questions)
# q_consistency_df = pd.DataFrame([{'question': q, **v} for q, v in list(question_consistency.items())])
# q_consistency_df.to_csv('../data/question_consistency_scores_all.csv', index=None)

In [10]:
# for pp_type in ['T5-finetuned', 'Prompt-text-davinci-002', 'QC']:
#     question_consistency = {}
#     for q in tqdm(df['original question'].unique()):
#         q_df = df[df['paraphrased generaton model'].isin([pp_type, 'original'])]
#         questions = q_df[q_df['original question'] == q]['paraphrased question']
#         if len(questions) < 2:
#             continue
#         question_consistency[q] = consistency_scorer.get_score(questions)
#     pd.DataFrame([{'question': q, **v} for q, v in list(question_consistency.items())]).to_csv(f'../data/question_consistency_scores_{pp_type}.csv', index=None)

In [11]:
# question_consistency = {}
# for q in tqdm(human_filtered_df['original question'].unique()):
#     questions = human_filtered_df[human_filtered_df['original question'] == q]['paraphrased question']
#     if len(questions) == 0:
#         continue
#     question_consistency[q] = consistency_scorer.get_score(questions)
# q_consistency_df = pd.DataFrame(
#     [{'question': q, **v} for q, v in list(question_consistency.items())]
# )
# q_consistency_df.to_csv('../data/question_consistency_scores_human_filtered.csv', index=None)

## Consistency of answers

* Compute answer consistency over all questions, per model ('facebook_opt-125m', 'facebook_opt-350m', 'facebook_opt-1.3b', 'facebook_opt-2.7b'), together with the average answer accuracy
* Compute answer consistency per model on the human-filtered questions only

In [None]:
scale_options = ['facebook_opt-125m', 'facebook_opt-350m', 'facebook_opt-1.3b', 'facebook_opt-2.7b']
decoder_options = ['greedy_answers', 'sampled_answers']
# NOTE(review): this rebinds `human_filtered`, which an earlier cell used for
# the manually filtered DataFrame. It is kept for compatibility and is not
# read in this cell (the filtering below uses `human_filtered_df`).
human_filtered = ['true', 'not_true']


def _write_answer_consistency(answers_df, answer_col, out_path, accurate_only=False):
    """Score answer consistency per original question and write it to CSV.

    Args:
        answers_df: Frame with an 'original question' column, the answer
            column `answer_col`, and its accuracy column
            f'{answer_col} bleu acc'. Missing values are expected to have
            been replaced by '' already.
        answer_col: Name of the answer column to score
            ('greedy_answers' or 'sampled_answers').
        out_path: Destination CSV path.
        accurate_only: If True, only answers whose accuracy equals 1 are
            scored and 'avg_acc' is fixed to 1.0; otherwise 'avg_acc' is the
            mean of the question's non-empty accuracy values.

    Questions with fewer than two non-empty answers are skipped.
    """
    acc_col = f'{answer_col} bleu acc'
    # Group once instead of re-scanning the whole frame for every question;
    # sort=False keeps the questions in order of first appearance.
    by_question = answers_df.groupby('original question', sort=False)
    rows = []
    for question, answers in tqdm(by_question, total=by_question.ngroups):
        if accurate_only:
            answers = answers[answers[acc_col] == 1]
        answer_strings = [ans for ans in answers[answer_col] if ans]
        if len(answer_strings) < 2:
            continue
        scores = consistency_scorer.get_score(answer_strings)
        if accurate_only:
            scores['avg_acc'] = 1.0
        else:
            acc = [ac for ac in answers[acc_col] if ac != '']
            # Guard against a question that has answers but no accuracy
            # values, which would otherwise raise ZeroDivisionError.
            scores['avg_acc'] = sum(acc) / len(acc) if acc else float('nan')
        rows.append({'question': question, **scores})
    pd.DataFrame(rows).to_csv(out_path, index=None)


# The set of human-filtered paraphrases does not depend on the model, so it
# is computed once outside the loop.
kept_paraphrases = human_filtered_df['paraphrased question'].unique()

for model in scale_options:
    print('Computing consistency for ', model)
    answers_df = pd.read_csv(f'../data/{model}_acc_scores.csv').fillna('')
    # Restrict to answers given to human-filtered paraphrases (and originals).
    answers_df = answers_df[answers_df['paraphrased question'].isin(kept_paraphrases)]
    # Same four outputs, in the same order as before:
    # greedy, greedy_and_accurate, sampled, sampled_and_accurate.
    for decoder in decoder_options:
        label = decoder.replace('_answers', '')
        for accurate_only in (False, True):
            variant = f'{label}_and_accurate' if accurate_only else label
            _write_answer_consistency(
                answers_df,
                decoder,
                f'../data/nli/{model}_answer_consistency_{variant}_human_filtered.csv',
                accurate_only=accurate_only,
            )

Computing consistency for  facebook_opt-125m


  0%|          | 0/804 [00:00<?, ?it/s]

In [None]:
scale_options = ['facebook_opt-125m', 'facebook_opt-350m', 'facebook_opt-1.3b', 'facebook_opt-2.7b']
decoder_options = ['greedy_answers', 'sampled_answers']
# NOTE(review): this rebinds `human_filtered`, which an earlier cell used for
# the manually filtered DataFrame. It is kept for compatibility and is not
# read in this cell.
human_filtered = ['true', 'not_true']


def _write_answer_consistency(answers_df, answer_col, out_path, accurate_only=False):
    """Score answer consistency per original question and write it to CSV.

    Args:
        answers_df: Frame with an 'original question' column, the answer
            column `answer_col`, and its accuracy column
            f'{answer_col} bleu acc'. Missing values are expected to have
            been replaced by '' already.
        answer_col: Name of the answer column to score
            ('greedy_answers' or 'sampled_answers').
        out_path: Destination CSV path.
        accurate_only: If True, only answers whose accuracy equals 1 are
            scored and 'avg_acc' is fixed to 1.0; otherwise 'avg_acc' is the
            mean of the question's non-empty accuracy values.

    Questions with fewer than two non-empty answers are skipped.
    """
    acc_col = f'{answer_col} bleu acc'
    # Group once instead of re-scanning the whole frame for every question;
    # sort=False keeps the questions in order of first appearance.
    by_question = answers_df.groupby('original question', sort=False)
    rows = []
    for question, answers in tqdm(by_question, total=by_question.ngroups):
        if accurate_only:
            answers = answers[answers[acc_col] == 1]
        answer_strings = [ans for ans in answers[answer_col] if ans]
        if len(answer_strings) < 2:
            continue
        scores = consistency_scorer.get_score(answer_strings)
        if accurate_only:
            scores['avg_acc'] = 1.0
        else:
            acc = [ac for ac in answers[acc_col] if ac != '']
            # Guard against a question that has answers but no accuracy
            # values, which would otherwise raise ZeroDivisionError.
            scores['avg_acc'] = sum(acc) / len(acc) if acc else float('nan')
        rows.append({'question': question, **scores})
    pd.DataFrame(rows).to_csv(out_path, index=None)


for model in scale_options:
    print('Computing consistency for ', model)
    # No human filtering in this cell: every answered paraphrase is scored.
    answers_df = pd.read_csv(f'../data/{model}_acc_scores.csv').fillna('')
    # Same four outputs, in the same order as before:
    # greedy, greedy_and_accurate, sampled, sampled_and_accurate.
    for decoder in decoder_options:
        label = decoder.replace('_answers', '')
        for accurate_only in (False, True):
            variant = f'{label}_and_accurate' if accurate_only else label
            _write_answer_consistency(
                answers_df,
                decoder,
                f'../data/nli/{model}_answer_consistency_{variant}.csv',
                accurate_only=accurate_only,
            )