# Pipelines

In [None]:
import json
import matplotlib.pyplot as plt
import datasets
import pandas as pd
from scipy.stats import shapiro, kruskal, spearmanr, wilcoxon
from cliffs_delta import cliffs_delta

In [None]:
def load_pipeline_analysis_data(name):
    """
    Load the analysis data for a pipeline

    Reads ``pipeline_results/analysis_{name}_result.json`` (relative to the
    current working directory) and wraps the parsed JSON in a DataFrame.

    :param name: the name of the pipeline
    :return: a pandas DataFrame with the analysis data
    :raises FileNotFoundError: if the result file does not exist
    :raises json.JSONDecodeError: if the file does not contain valid JSON
    """
    file_name = f'pipeline_results/analysis_{name}_result.json'
    # JSON text is UTF-8; be explicit instead of relying on the platform's
    # default encoding, which differs between operating systems.
    with open(file_name, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return pd.DataFrame(data)

def load_pipeline_data(name):
    """
    Load the generated data for a pipeline

    Reads ``pipeline_results/{name}_result.json`` (relative to the current
    working directory) and wraps the parsed JSON in a DataFrame.

    :param name: the name of the pipeline
    :return: a pandas DataFrame with the generated data
    :raises FileNotFoundError: if the result file does not exist
    :raises json.JSONDecodeError: if the file does not contain valid JSON
    """
    file_name = f'pipeline_results/{name}_result.json'
    # JSON text is UTF-8; be explicit instead of relying on the platform's
    # default encoding, which differs between operating systems.
    with open(file_name, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return pd.DataFrame(data)

# Evaluation metrics of every pipeline, keyed by pipeline name.
pipelines_metrics = {
    pipeline_name: load_pipeline_analysis_data(pipeline_name)
    for pipeline_name in ('pipeline1', 'pipeline2', 'pipeline3')
}

# Raw generated output of every pipeline, keyed by pipeline name.
pipelines = {
    pipeline_name: load_pipeline_data(pipeline_name)
    for pipeline_name in ('pipeline1', 'pipeline2', 'pipeline3')
}

## Create samples for manual analysis
Each sample contains 40 questions from the test set together with the generated questions and distractors from one pipeline (one sample for pipeline1 and one for pipeline2).

In [None]:
# Build one CSV per pipeline for manual review: 40 randomly sampled generated
# items placed side by side with the SciQ test rows they were generated from.

# Columns written to the manual-analysis CSV files.
manual_columns = [
    'question', 'generated_question', 'correct_answer',
    'distractor1', 'distractor2', 'distractor3',
    'generated_distractors', 'support',
]

pipeline1 = load_pipeline_data('pipeline1')
# Keep only rows for which both a question and distractors were generated.
pipeline1.dropna(subset=['generated_question', 'generated_distractors'], inplace=True)
# 'correct_answer' and 'support' are taken from the SciQ rows below instead
# (presumably dropped here to avoid duplicate column names after the concat).
pipeline1.drop(columns=['correct_answer', 'support', 'time'], inplace=True)
pipeline1_sample = pipeline1.sample(40, random_state=42)

pipeline2 = load_pipeline_data('pipeline2')
pipeline2.dropna(subset=['generated_question', 'generated_distractors'], inplace=True)
pipeline2.drop(columns=['correct_answer', 'support', 'time'], inplace=True)
pipeline2_sample = pipeline2.sample(40, random_state=42)

test_data = datasets.load_dataset('allenai/sciq', split='test').to_pandas()

# The row labels of a sample identify the matching SciQ test rows, so the
# lookup has to happen BEFORE the sample's index is reset. Both sides are then
# renumbered 0..39 so that the column-wise concat pairs them row by row.
test_data_sample = test_data.loc[pipeline1_sample.index.astype('int32')].reset_index(drop=True)
pipeline1_sample.reset_index(drop=True, inplace=True)
manual_sample_1 = pd.concat([test_data_sample, pipeline1_sample], axis=1)
manual_sample_1.to_csv('manual_analysis_1.csv', columns=manual_columns)

# pipeline2 may have dropped different rows than pipeline1, so its sample needs
# its own lookup into the test set; reusing pipeline1's test rows (and leaving
# pipeline2_sample's index un-reset) would misalign the concat.
test_data_sample_2 = test_data.loc[pipeline2_sample.index.astype('int32')].reset_index(drop=True)
pipeline2_sample.reset_index(drop=True, inplace=True)
manual_sample_2 = pd.concat([test_data_sample_2, pipeline2_sample], axis=1)
manual_sample_2.to_csv('manual_analysis_2.csv', columns=manual_columns)


# Evaluate manual scoring

In [None]:
# Compare the manual guideline scores given to pipeline1 and pipeline2.
manual_pipe1 = pd.read_csv('pipeline1_manual.csv')
manual_pipe2 = pd.read_csv('pipeline2_manual.csv')


def _zscore(scores):
    """Standardize *scores* to zero mean and unit standard deviation.

    A column with zero standard deviation is mapped to all zeros instead of
    dividing by zero.
    """
    spread = scores.std()
    if spread == 0:
        return pd.Series(0.0, index=scores.index)
    return (scores - scores.mean()) / spread


# The first column is skipped (presumably an item identifier - confirm against
# the CSV layout); every remaining column is treated as one scoring guideline.
score_columns = manual_pipe1.columns[1:]

# NOTE(review): each pipeline is standardized on its own, which centres both
# score columns at 0 before the paired test below. Confirm this is intended,
# since it removes any overall difference in score level between the pipelines.
for column in score_columns:
    manual_pipe1[column] = _zscore(manual_pipe1[column])
    manual_pipe2[column] = _zscore(manual_pipe2[column])

for column in score_columns:
    # Report the normality check instead of silently discarding it.
    stat1, p1 = shapiro(manual_pipe1[column])
    stat2, p2 = shapiro(manual_pipe2[column])
    print(f'Shapiro-Wilk test for guideline {column}: pipeline1 p={p1}, pipeline2 p={p2}')
    # Paired comparison; 'zsplit' splits zero differences between the
    # positive and negative ranks rather than dropping them.
    res = wilcoxon(manual_pipe1[column], manual_pipe2[column], zero_method='zsplit')
    print(f'Wilcoxon test for guideline {column}: stat={res.statistic}, p={res.pvalue}')

# Statistical analysis of metrics for QG

In [None]:
# Question-generation metrics and answer metrics available in the analysis data.
pipeline_q_metrics = ['bleu', 'rouge1', 'rouge2', 'rougeL', 'bleurt', 'bertscore']
pipeline_a_metrics = ['bleu_ans', 'rouge1_ans', 'rouge2_ans', 'rougeL_ans', 'bertscore_ans']

# Normality check per pipeline and metric.
print('Shapiro-Wilk test for normality')
for pipeline, df in pipelines_metrics.items():
    for metric in pipeline_q_metrics:
        stat, p = shapiro(df[metric].dropna())
        print(f'{pipeline} {metric} p={p}')

# Omnibus comparison of the three pipelines, one test per metric.
print('Kruskal-Wallis H-test for equal medians')
for metric in pipeline_q_metrics:
    stat, p = kruskal(
        pipelines_metrics['pipeline1'][metric].dropna(),
        pipelines_metrics['pipeline2'][metric].dropna(),
        pipelines_metrics['pipeline3'][metric].dropna(),
    )
    print(f'{metric}: stat={stat}, p={p}')

# Pairwise effect sizes. The loop variables are deliberately NOT called
# pipeline1 / pipeline2: those names hold DataFrames elsewhere in this
# notebook and would otherwise be overwritten with strings.
print('Cliff\'s delta for effect size')
for metric in pipeline_q_metrics:
    for first, second in [('pipeline1', 'pipeline2'), ('pipeline1', 'pipeline3'), ('pipeline2', 'pipeline3')]:
        delta = cliffs_delta(
            pipelines_metrics[first][metric].dropna(),
            pipelines_metrics[second][metric].dropna(),
        )
        print(f'{first} vs {second} {metric}: {delta}')

# Analysis of answer metrics

In [None]:
# One box per answer metric of pipeline3, drawn left to right.
answer_columns = ['bleu_ans', 'rouge1_ans', 'rouge2_ans', 'rougeL_ans', 'bertscore_ans']
pipeline3_scores = pipelines_metrics['pipeline3']

plt.figure()
for slot, column_name in enumerate(answer_columns, start=1):
    plt.boxplot(pipeline3_scores[column_name].dropna(), positions=[slot], widths=0.6)
plt.xticks([1, 2, 3, 4, 5], ['bleu', 'rouge1', 'rouge2', 'rougeL', 'bertscore'])
plt.title('Pipeline3 answer metrics')
plt.ylabel('Score')
plt.show()

# Analysis of distractors

In [None]:
# One box per pipeline showing the distribution of its 'max_bleurt' values.
boxplot_order = ['pipeline1', 'pipeline2', 'pipeline3']

plt.figure()
for slot, pipeline_name in enumerate(boxplot_order, start=1):
    bleurt_scores = pipelines_metrics[pipeline_name]['max_bleurt'].dropna()
    plt.boxplot(bleurt_scores, positions=[slot], widths=0.6)
plt.xticks([1, 2, 3], boxplot_order)
plt.ylabel('Score')
plt.title('Distractors bleurt score')
plt.show()

In [None]:
# Statistical comparison of the distractor 'max_bleurt' scores.
# The pair variables are deliberately NOT called pipeline1 / pipeline2: those
# names hold DataFrames elsewhere in this notebook and would otherwise be
# overwritten with strings.
pipeline_pairs = [('pipeline1', 'pipeline2'), ('pipeline1', 'pipeline3'), ('pipeline2', 'pipeline3')]

print('Shapiro-Wilk test for normality')
for pipeline, df in pipelines_metrics.items():
    stat, p = shapiro(df['max_bleurt'].dropna())
    print(f'{pipeline} max_bleurt p={p}')

# Pairwise tests; each result line names the pair it belongs to so the three
# outputs can be told apart.
# NOTE(review): three pairwise tests are run without a multiple-comparison
# correction - confirm whether one should be applied.
print('Kruskal-Wallis H-test for equal medians')
for first, second in pipeline_pairs:
    stat, p = kruskal(
        pipelines_metrics[first]['max_bleurt'].dropna(),
        pipelines_metrics[second]['max_bleurt'].dropna(),
    )
    print(f'Kruskal-Wallis H-test for equal medians ({first} vs {second}): stat={stat}, p={p}')

print('Cliff\'s delta for effect size')
for first, second in pipeline_pairs:
    delta = cliffs_delta(
        pipelines_metrics[first]['max_bleurt'].dropna(),
        pipelines_metrics[second]['max_bleurt'].dropna(),
    )
    print(f'{first} vs {second}: {delta}')


In [None]:
# Rank correlation between generation time and distractor 'max_bleurt' score.
print('Spearman correlation between time and bleurt')
for name, pipeline in pipelines.items():
    # Pair the two measurements by row label and drop a row when EITHER value
    # is missing. Dropping NaNs from each series separately would shift the
    # pairing (or produce unequal lengths) whenever the missing rows differ.
    # Assumes the generated-data and analysis frames share the same row
    # labels - TODO confirm against the result files.
    paired = pd.DataFrame({
        'time': pipeline['time'],
        'max_bleurt': pipelines_metrics[name]['max_bleurt'],
    }).dropna()
    corr, p = spearmanr(paired['max_bleurt'], paired['time'])
    print(f'{name} corr={corr}, p={p}')

# Statistical analysis of time

In [None]:
# Statistical comparison of the per-item generation time of the pipelines.
print('Shapiro-Wilk test for normality')
for pipeline, df in pipelines.items():
    stat, p = shapiro(df['time'].dropna())
    print(f'{pipeline} time p={p}')

print('Kruskal-Wallis H-test for equal medians')
stat, p = kruskal(
    pipelines['pipeline1']['time'].dropna(),
    pipelines['pipeline2']['time'].dropna(),
    pipelines['pipeline3']['time'].dropna(),
)
print(f'Kruskal-Wallis H-test for equal medians: stat={stat}, p={p}')

# The pair variables are deliberately NOT called pipeline1 / pipeline2: those
# names hold DataFrames elsewhere in this notebook and would otherwise be
# overwritten with strings.
print('Cliff\'s delta for effect size')
for first, second in [('pipeline1', 'pipeline2'), ('pipeline1', 'pipeline3'), ('pipeline2', 'pipeline3')]:
    delta = cliffs_delta(pipelines[first]['time'].dropna(), pipelines[second]['time'].dropna())
    print(f'{first} vs {second}: {delta}')

# Series.median() / Series.mean() skip missing values by default.
print('Medians')
for name in ('pipeline1', 'pipeline2', 'pipeline3'):
    print(f'{name} median: ', pipelines[name]['time'].median())

print('Means')
for name in ('pipeline1', 'pipeline2', 'pipeline3'):
    print(f'{name} mean: ', pipelines[name]['time'].mean())

In [None]:
# Rank correlation between generation time and the length of the support text.
test_data = datasets.load_dataset('allenai/sciq', split='test').to_pandas()
test_data['support_length'] = test_data['support'].map(len)

print('Spearman correlation between time and support length')
for name, pipeline in pipelines.items():
    measured_times = pipeline['time'].dropna()
    # The row labels of the timing data are cast to int32 and used to pick the
    # matching rows of the test set.
    matching_rows = measured_times.index.astype('int32')
    support_lengths = test_data.loc[matching_rows, 'support_length']

    corr, p = spearmanr(support_lengths, measured_times)
    print(f'{name} corr={corr}, p={p}')