In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Global figure typography: Times New Roman for regular text, and the
# DejaVu Serif font set for mathtext.
plt.rcParams.update({
    "font.family": "Times New Roman",
    "mathtext.fontset": "dejavuserif",
})


# Vote counts from the three-way ranking study.
# Key scheme: votes_<model>_<criterion><rank>
#   model:     rome | ft_l | gpt
#   criterion: c (consistency) | f (fluency)
#   rank:      m (most) | n (middle) | l (least)
# For every model/criterion pair the three rank counts add up to 150.
# Insertion order (rome, ft_l, gpt) is kept as in the original because the
# plotting cells may iterate over the keys in order.
data = {
    # ROME
    "votes_rome_cm": 89,
    "votes_rome_cn": 44,
    "votes_rome_cl": 17,
    "votes_rome_fm": 40,
    "votes_rome_fn": 45,
    "votes_rome_fl": 65,
    # FT-L
    "votes_ft_l_cm": 45,
    "votes_ft_l_cn": 68,
    "votes_ft_l_cl": 37,
    "votes_ft_l_fm": 53,
    "votes_ft_l_fn": 56,
    "votes_ft_l_fl": 41,
    # GPT-2-XL
    "votes_gpt_cm": 16,
    "votes_gpt_cn": 38,
    "votes_gpt_cl": 96,
    "votes_gpt_fm": 57,
    "votes_gpt_fn": 49,
    "votes_gpt_fl": 44,
}


# One grouped bar chart per criterion ('c' = consistency, 'f' = fluency):
# for each model, the number of responses that ranked it most / middle / least.
# Each figure is saved as human_evaluation_of_<criterion>.pdf and then shown.
#
# The model order is spelled out here so that the bar order is guaranteed to
# match the tick labels, instead of depending on the insertion order of `data`.
# NOTE(review): the pairwise cells label the ft_l method 'FT+L' while this
# tick label says 'FT-L' -- confirm which spelling is intended.
model_keys = ['rome', 'ft_l', 'gpt']        # key fragments used in `data`
model_names = ['ROME', 'FT-L', 'GPT-2-XL']  # tick labels, same order

for c in 'cf':
    consistency = dict(c='Consistency', f='Fluency')[c]
    consistent = dict(c='consistent', f='fluent')[c]

    x = np.arange(len(model_keys))  # the label locations
    width = 0.3  # the width of the bars

    fig, ax = plt.subplots(figsize=(4,3), dpi=100)

    # One bar series per rank, shifted by one bar width around each tick.
    for m, offset in zip('mnl', [-0.3, 0.0, 0.3]):
        values = [data[f'votes_{model}_{c}{m}'] for model in model_keys]
        rects = ax.bar(x + offset, values, width,
                       label=dict(m=f'Most {consistent}', n='Middle', l=f'Least {consistent}')[m],
                       color=dict(m='green', n='orange', l='red')[m])
        ax.bar_label(rects, padding=3)

    # Axis decoration does not depend on the series, so it is done once.
    ax.set_ylabel('Responses (out of 150)')
    ax.set_title(f'Human evaluation of {consistency}')
    ax.legend()
    ax.set_ylim(0, 120)
    ax.set_xticks(x, model_names)
    fig.tight_layout()
    fig.savefig(f'human_evaluation_of_{consistency.lower()}.pdf')
    plt.show()


In [None]:
from collections import defaultdict

# One bar chart per criterion ('c' = consistency, 'f' = fluency) showing each
# model's average rank over the 150 three-way rankings (1 = best, 3 = worst).
# Each figure is saved as human_ranking_of_<criterion>.pdf and then shown.
#
# Keys are built explicitly per model so the bars are guaranteed to line up
# with the tick labels (no reliance on dict order or on a character index
# into the key string).
model_keys = ['rome', 'ft_l', 'gpt']        # key fragments used in `data`
model_names = ['ROME', 'FT-L', 'GPT-2-XL']  # tick labels, same order
rank_of = dict(m=1, n=2, l=3)               # most -> 1, middle -> 2, least -> 3

for c in 'cf':
    consistency = dict(c='Consistency', f='Fluency')[c]

    # Average rank = sum(rank * votes at that rank) / number of rankings.
    values = [
        sum(rank * data[f'votes_{model}_{c}{m}'] for m, rank in rank_of.items()) / 150.0
        for model in model_keys
    ]

    x = np.arange(len(model_keys))  # the label locations

    fig, ax = plt.subplots(figsize=(4,3), dpi=100)
    # Bar height is (3 - average rank), so a better (lower) rank gives a
    # taller bar; the bar labels still show the actual average rank.
    rects = ax.bar(x, [3.0-v for v in values], 0.7,
                   color=dict(c='green', f='orange')[c])
    ax.set_ylabel('Average rank (of 150 3-way rankings)')
    ax.set_title(f'Human evaluation of {consistency}')
    ax.bar_label(rects, labels=['%.2f' % v for v in values], padding=3)
    ax.set_ylim(0, 2)
    ax.set_xticks(x, model_names)
    # Relabel the y axis with the rank itself: height 0 -> rank 3, height 2 -> rank 1.
    ax.set_yticks(np.linspace(0, 2, 5), 3 - np.linspace(0, 2, 5))
    fig.tight_layout()
    fig.savefig(f'human_ranking_of_{consistency.lower()}.pdf')
    plt.show()


In [None]:
# Pairwise comparison counts.
# Key scheme: <first>_vs_<second>_<criterion>, criterion c (consistency) or
# f (fluency). The plotting cells treat the value as the number of responses
# (out of 150) favouring <first>, and 150 minus the value as favouring <second>.
tw = {
    # consistency
    "rome_vs_ft_l_c": 97,
    "rome_vs_gpt_c": 125,
    "ft_l_vs_gpt_c": 105,
    # fluency
    "rome_vs_ft_l_f": 64,
    "rome_vs_gpt_f": 61,
    "ft_l_vs_gpt_f": 76,
}

# One chart per criterion ('c' = consistency, 'f' = fluency) comparing ROME
# against each of the other two methods: how many of the 150 responses judged
# ROME more vs. less consistent (or fluent).
#
# NOTE(review): a later cell in this notebook saves to the same
# human_pairwise_<criterion>.pdf paths, so on a full run the files written
# here are overwritten -- confirm which chart is meant to be kept.
for c in 'cf':
    consistency = dict(c='Consistency', f='Fluency')[c]
    consistent = dict(c='consistent', f='fluent')[c]

    x = np.arange(2)  # the label locations
    width = 0.4  # the width of the bars

    fig, ax = plt.subplots(figsize=(4,3), dpi=100)

    # Same order as the tick labels below: ROME vs GPT, then ROME vs FT+L.
    rome_more = [tw[f'rome_vs_{g}_{c}'] for g in ['gpt', 'ft_l']]
    rome_less = [150 - v for v in rome_more]

    # Two bars per comparison, each shifted by half a bar width around the tick.
    for values, offset, label, color in [
        (rome_more, -0.2, f'ROME more {consistent}', 'green'),
        (rome_less, 0.2, f'ROME less {consistent}', 'red'),
    ]:
        rects = ax.bar(x + offset, values, width, label=label, color=color)
        ax.bar_label(rects, padding=3)

    # Axis decoration does not depend on the series, so it is done once.
    ax.set_ylabel('Responses (out of 150)')
    ax.set_title(f'Human evaluation of {consistency}')
    ax.legend()
    ax.set_ylim(0, 150)
    ax.set_xticks(x, ['ROME vs GPT', 'ROME vs FT+L'])
    fig.tight_layout()
    fig.savefig(f'human_pairwise_{consistency.lower()}.pdf')
    plt.show()


In [None]:
# One chart per criterion ('c' = consistency, 'f' = fluency) covering all three
# pairwise comparisons. Each comparison gets two bars, coloured by the method
# that was preferred (green = ROME, red = GPT, orange = FT+L).
#
# NOTE(review): this writes to the same human_pairwise_<criterion>.pdf paths
# as the previous cell and therefore overwrites its output on a full run --
# confirm that is intended.
for c in 'cf':
    consistency = dict(c='Consistency', f='Fluency')[c]
    consistent = dict(c='consistent', f='fluent')[c]

    # Responses (out of 150) favouring the first-named method of each pair;
    # the remainder (150 - value) favoured the second-named method.
    rome_over_gpt = tw[f'rome_vs_gpt_{c}']
    rome_over_ft = tw[f'rome_vs_ft_l_{c}']
    ft_over_gpt = tw[f'ft_l_vs_gpt_{c}']

    x = np.arange(3)  # the label locations
    width = 0.4  # the width of the bars
    offset = 0.2  # half a bar width: left/right bar of each pair

    fig, ax = plt.subplots(figsize=(5,3), dpi=100)

    # (bar positions, heights, legend label, colour) per preferred method.
    # Tick 0 is ROME vs GPT, tick 1 is ROME vs FT+L, tick 2 is FT+L vs GPT.
    series = [
        ([0-offset, 1-offset], [rome_over_gpt, rome_over_ft],
         f'ROME more {consistent}', 'green'),
        ([0+offset, 2+offset], [150 - rome_over_gpt, 150 - ft_over_gpt],
         f'GPT more {consistent}', 'red'),
        ([1+offset, 2-offset], [150 - rome_over_ft, ft_over_gpt],
         f'FT+L more {consistent}', 'orange'),
    ]
    for positions, heights, label, color in series:
        rects = ax.bar(positions, heights, width=width, label=label, color=color)
        ax.bar_label(rects, padding=3)

    ax.set_ylabel('Responses (out of 150)')
    ax.set_title(f'Human evaluation of {consistency}')
    ax.legend(loc='upper center')
    # Headroom above 150 leaves space for the legend at the top.
    ax.set_ylim(0, 180)
    ax.set_xticks(x, ['ROME vs GPT', 'ROME vs FT+L', 'FT+L vs GPT'])
    fig.tight_layout()
    fig.savefig(f'human_pairwise_{consistency.lower()}.pdf')
    plt.show()
