In [5]:
from lexical_substitution import *
import os 
import numpy as np
from scipy.stats import pearsonr, spearmanr
def plot_results(df, output_dir, comb, similarity, phase = 'dev', measure = 'recall', sum = False, find_best = False):
    matplotlib.rc('text', usetex=True)
    matplotlib.rcParams.update({'font.size': 55})
    palette = plt.get_cmap('tab20')
    if sum:
        measure = 'recall'

    fig, axs = plt.subplots(1, 1, figsize=(18, 18))

    value_exp = df[measure + '-expectation'].mean()
    if sum:
        value_exp += np.nanmean(list(df['gap-expectation']))

    value_word = np.nanmean(list(df[measure + '-lexical']))
    if sum:
        value_word +=  np.nanmean(list(df['gap-lexical']))

    if find_best:
        value_best = np.nanmean(list(df[measure + '-best']))
        if sum:
            value_best += np.nanmean(list(df['gap-best']))

    axs.axhline(y=value_exp, label='Expectation',  linestyle='-', linewidth=8,
                             color=palette(19))

    axs.axhline(y=value_word, label='Lexical',  linestyle='-', linewidth=8,
                             color=palette(7))

    scores_recall = []
    scores_gap = []
    params = []

    for col in df.columns:
        if 'recall-' in col and 'expectation' not in col and 'lexical' not in col and 'best' not in col:
            param = col.split('recall-')[1]
            params.append(float(param))
            param = str(param)
            scores_recall.append(np.nanmean(df['recall-'+str(param)]))
            scores_gap.append(np.nanmean(df['gap-'+str(param)]))
    
    boundary = True
    for p in params:
        val = df['sim_exp_output-' + str(p)].mean() > df['sim_lex_output-' + str(p)].mean()
        if not val:
            boundary = p
            break
            
    plt.axvline(x=boundary, label='Exp $>$ Lex', color=palette(5),linewidth=8, alpha=1, linestyle= 'dotted')
    
    if not sum:
        if measure == 'recall':
            values_combined = scores_recall
            measure_tmp = "\\textsc{Recall}-10"
        elif measure == 'gap':
            values_combined = scores_gap
            measure_tmp = "\\textsc{GAP}"
    else:
        values_combined = np.array(scores_recall) + np.array(scores_gap)
        measure_tmp = '\\textsc{Recall}-10 + \\textsc{GAP}'
        measure = 'recall+gap'
    if comb == 'weighted_avg':
        axs.set_xticks([round(i, 1) for i in list(np.array(range(0, 101, 10)) * 0.01)])
    else:
        axs.set_xticks([round(i, 1) for i in list(np.array(range(0, 301, 30)) * 0.01)])

    axs.set_xlabel('$\\alpha$', fontsize = 60)
    axs.set_ylabel(measure_tmp, fontsize=60)
    axs.plot(params, values_combined, label = 'Combined - constant $\\alpha$', color=palette(9), linewidth=8, alpha=1, markersize=18)
    
    
    if find_best:
        plt.axhline(y=value_best, label='Combined - optimal $\\alpha$', color=palette(13),linewidth=8, alpha=1, markersize=17)
    
    handles, labels = axs.get_legend_handles_labels()
    
    if find_best:
        fig.savefig(output_dir + phase + '_' + comb+ '_' + measure + '_' + similarity + '_withbest.png')
    else:
        fig.savefig(output_dir + phase + '_' + comb + '_' + measure + '_' + similarity + '.png')
    fig.clear()
    
    fig = plt.figure(figsize = (10, 6))
    fig.legend(handles, labels, loc='upper center',
                     ncol=1, shadow=True, fontsize= 50, facecolor='white', framealpha=1)

    fig.savefig(output_dir + phase + '_' + comb + '_' + measure + '_' + similarity + '_legend.png')
    fig.clear()
    


def plot_surprisal_output(test_df, param_best, output_dir, comb, similarity, phase = 'test'):
    print('surprisal')
    matplotlib.rc('text', usetex=True)
    matplotlib.rcParams.update({'font.size': 70})
    palette = plt.get_cmap('tab10')
    plt.figure(figsize = (20, 20))

    x = test_df['sim_exp_output-' + param_best]
    y = test_df['surprisal_word']
    print('expoutput-surprisal', spearmanr(x, y))

    plt.scatter(-x, y, color = palette(0))

    plt.xlabel('$ 1 - cosine (\mathbf{e}_c, \mathbf{i}_{v, c})$')
    plt.ylabel('$ - \log P (v| c; LM)$')
    plt.savefig(output_dir + phase + '_' + comb + '_' + similarity + '_surprisal-expoutput.png')
    plt.clf()

    x = test_df['sim_exp_output-best']
    y = test_df['surprisal_word']
    print('expoutput-surprisal best', spearmanr(x, y))

    plt.scatter(-x, y, color=palette(0))

    plt.xlabel('$ 1 -  cosine (\mathbf{e}_c, \mathbf{i}_{v, c})$')
    plt.ylabel('$ - \log P (v| c; LM)$')
    plt.savefig(output_dir + phase + '_' + comb + '_' + similarity + '_best_surprisal-expoutput.png')
    plt.clf()

    x = test_df['sim_lex_output-' + param_best]
    y = test_df['surprisal_word']
    print('lexoutput-surprisal', spearmanr(x, y))

    plt.scatter(-x, y, color = palette(0))

    plt.xlabel('$ 1 - cosine (\mathbf{l}_{v}, \mathbf{i}_{v, c})$')
    plt.ylabel('$ - \log P (v| c; LM)$')
    plt.savefig(output_dir + phase + '_' + comb + '_' + similarity + '_surprisal-lexoutput.png')
    plt.clf()

    x = test_df['sim_lex_output-best']
    y = test_df['surprisal_word']
    print('lexoutput-surprisal best', spearmanr(x, y))

    plt.scatter(-x, y, color=palette(0))

    plt.xlabel('$ 1 - cosine (\mathbf{l}_{v}, \mathbf{i}_{v, c})$')
    plt.ylabel('$ - \log P (v| c; LM)$')
    plt.savefig(output_dir + phase + '_' + comb + '_' + similarity + '_best_surprisal-lexoutput.png')
    plt.clf()

def plot_alpha_output(test_df, param_best, output_dir, comb, similarity, phase = 'test'):
    print('alpha')
    matplotlib.rc('text', usetex=True)
    matplotlib.rcParams.update({'font.size': 70})
    palette = plt.get_cmap('tab10')
    plt.figure(figsize = (20, 20))
    
    x = test_df['sim_exp_output-' + param_best]
    y = test_df['best_param']
    print('expoutput-surprisal', spearmanr(x, y))

    plt.scatter(x, y, color = palette(0))

    plt.xlabel('$ cosine (\mathbf{e}_c, \mathbf{i}_{v, c})$')
    plt.ylabel('optimal $\\alpha$')
    plt.savefig(output_dir + phase + '_' + comb + '_' + similarity + '_alpha-expoutput.png')
    plt.clf()

    x = test_df['sim_exp_output-best']
    y = test_df['best_param']
    print('expoutput-surprisal best', spearmanr(x, y))

    plt.scatter(x, y, color=palette(0))

    plt.xlabel('$ cosine (\mathbf{e}_c, \mathbf{i}_{v, c})$')
    plt.ylabel('optimal $\\alpha$')
    plt.savefig(output_dir + phase + '_' + comb + '_' + similarity + '_best_alpha-expoutput.png')
    plt.clf()

    x = test_df['sim_lex_output-' + param_best]
    y = test_df['best_param']
    print('lexoutput-surprisal', spearmanr(x, y))

    plt.scatter(x, y, color = palette(0))

    plt.xlabel('$ cosine (\mathbf{l}_{v}, \mathbf{i}_{v, c})$')
    plt.ylabel('optimal $\\alpha$')
    plt.savefig(output_dir + phase + '_' + comb + '_' + similarity + '_alpha-lexoutput.png')
    plt.clf()

    x = test_df['sim_lex_output-best']
    y = test_df['best_param']
    print('lexoutput-surprisal best', spearmanr(x, y))

    plt.scatter(x, y, color=palette(0))

    plt.xlabel('$ cosine (\mathbf{l}_{v}, \mathbf{i}_{v, c})$')
    plt.ylabel('optimal $\\alpha$')
    plt.savefig(output_dir + phase + '_' + comb + '_' + similarity + '_best_alpha-lexoutput.png')
    plt.clf()
    
def plot_results_together(output_dir, combinations, similarity, phase = 'dev', measure = 'recall', sum = False, find_best = False):
    combs_df = {}
    for comb in combinations:
        output_dir_tmp = output_dir + comb + '/'
        combs_df[comb] = pd.read_csv(output_dir_tmp + 'dev_' + comb + '_' + similarity + '_all.csv')
    
    matplotlib.rc('text', usetex=True)
    plt.rc('font', family='sans-erif')
    matplotlib.rcParams.update({'font.size': 90})
    palette = plt.get_cmap('tab20')
    matplotlib.rcParams['axes.linewidth'] = 3
    if sum:
        measure = 'recall'

    fig, axs = plt.subplots(1, 2, figsize=(40, 25), sharey=True)

    
    value_exp = combs_df['delta'][measure + '-expectation'].mean()
    if sum:
        value_exp += np.nanmean(list(combs_df['delta']['gap-expectation']))

    value_word = np.nanmean(list(combs_df['delta'][measure + '-lexical']))
    if sum:
        value_word +=  np.nanmean(list(combs_df['delta']['gap-lexical']))

    for ax in axs:
        ax.axhline(y=value_exp, label='Expectation',  linestyle='-', linewidth=18,
                             color=palette(19), marker = "^")

        ax.axhline(y=value_word, label='Lexical',  linestyle='-',linewidth=18,
                             color=palette(7), marker = "v")
    
    i = 0
    for comb in combinations:
        scores_recall = []
        scores_gap = []
        params = []
    
        if find_best:
            value_best = np.nanmean(list(combs_df[comb][measure + '-best']))
            if sum:
                value_best += np.nanmean(list(combs_df[comb]['gap-best']))
            
        for col in combs_df[comb].columns:
            if 'recall-' in col and 'expectation' not in col and 'lexical' not in col and 'best' not in col:
                param = col.split('recall-')[1]
                params.append(float(param))
                param = str(param)
                scores_recall.append(np.nanmean(combs_df[comb]['recall-'+str(param)]))
                scores_gap.append(np.nanmean(combs_df[comb]['gap-'+str(param)]))
    
        if not sum:
            if measure == 'recall':
                values_combined = scores_recall
                measure_tmp ="RECALL-10"
            elif measure == 'gap':
                values_combined = scores_gap
                measure_tmp = "GAP"
            measure_label = measure
        else:
            values_combined = np.array(scores_recall) + np.array(scores_gap)
            measure_tmp = 'RECALL-10 + GAP'
            measure_label = 'recall+gap'
        
        if comb == 'weighted_avg':
            axs[i].set_xticks([round(i, 1) for i in list(np.array(range(0, 101, 20)) * 0.01)])
            axs[i].set_xlabel('\\textit{avg-}$\\alpha$')
        else:
            axs[i].set_xticks([round(i, 1) for i in list(np.array(range(0, 301, 60)) * 0.01)])
            axs[i].set_xlabel('\\textit{delta-}$\\alpha$')
        
        axs[i].set_ylabel(measure_tmp)
        axs[i].plot(params, values_combined, label = 'Combined - constant $\\alpha$', 
                    color=palette(9), linewidth=18, alpha=1, markersize=18)

        if find_best:
            axs[i].axhline(y=value_best, label='Combined - optimal $\\alpha$', color=palette(13),linewidth=18, alpha=1, 
                           markersize=1)
        
        boundary = True
        for p in params:
            val = round(combs_df[comb]['sim_exp_output-' + str(p)].mean(),5) > round(combs_df[comb]['sim_lex_output-' + str(p)].mean(),5)
            if not val:
                boundary = p
                break
        axs[i].axvline(x=boundary, label='Exp vs. Lex', color=palette(5),linewidth=18, alpha=1, linestyle= 'dotted')
        
        handles, labels = axs[i].get_legend_handles_labels()
        i += 1
    plt.tight_layout()
    if find_best:
        fig.savefig(output_dir + phase + '_'  + measure_label + '_' + similarity + '_withbest.png')
    else:
        fig.savefig(output_dir + phase + '_' + measure_label + '_' + similarity + '.png')
    fig.clear()
    plt.tight_layout()
    fig = plt.figure(figsize = (40, 4))
    fig.legend(handles, labels, loc='upper center',
                     ncol=3, fontsize= 70, facecolor='white', framealpha = 0)

    fig.savefig(output_dir + phase + '_' + measure_label + '_' + similarity + '_legend.png')
    plt.show()
    

    
def analyze_results(output_dir, combinations, similarity, find_best = True):
    
    plot_results_together(output_dir,  combinations, similarity, measure='recall', find_best=find_best, phase = 'dev')
    plot_results_together(output_dir, combinations, similarity, measure='gap', find_best=find_best, phase = 'dev')
    plot_results_together(output_dir, combinations, similarity, sum=True, find_best=find_best, phase = 'dev')
    
#     for comb in combinations:
#         print(comb)
#         output_dir_tmp = output_dir + comb + '/'
#         dev_output= pd.read_csv(output_dir_tmp + 'dev_' + comb + '_' + similarity + '_all.csv')
#         test_output= pd.read_csv(output_dir_tmp + 'test_' + comb + '_' + similarity + '_all.csv')
        
# #         plot_results(dev_output, output_dir_tmp,  comb, similarity, measure='recall', find_best=find_best, phase = 'dev')
# #         plot_results(dev_output, output_dir_tmp, comb, similarity, measure='gap', find_best=find_best, phase = 'dev')
# #         plot_results(dev_output, output_dir_tmp, comb, similarity, sum=True, find_best=find_best, phase = 'dev')

#         # Find trained model with smallest validation loss
#         dev_scores = pd.read_csv(output_dir + 'dev' + '_' + similarity + '_all_scores_withbest.csv')
#         data_combination = dev_scores[(dev_scores.evaluated == 'combined_' + comb) & (dev_scores.alpha != 'best')]
#         best_model_row = data_combination.loc[data_combination['recall+gap'].idxmax()]
#         param_best = best_model_row['alpha']

# #         plot_surprisal_output(test_output, param_best, output_dir_tmp, comb, similarity, phase = 'test')
# #         plot_alpha_output(test_output, param_best, output_dir_tmp, comb, similarity, phase = 'test')
#         print(spearmanr(dev_output['surprisal_word'], 1 - np.array(dev_output['sim_exp_output-0.3'])))
#         print(spearmanr(dev_output['sim_exp_lex'], 1 - np.array(dev_output['sim_exp_output-0.3'])))

results_dir = '../results_20-1/lexical_substitution/'
combinations = ['weighted_avg', 'delta']
similarity = 'cosine'

for model_eval in os.listdir(results_dir): 
    print(model_eval)
    output_dir = results_dir + model_eval + '/'
    analyze_results(output_dir, combinations, similarity, find_best = True)

LSTM-tied


<Figure size 2880x1800 with 0 Axes>

<Figure size 2880x288 with 0 Axes>

<Figure size 2880x1800 with 0 Axes>

<Figure size 2880x288 with 0 Axes>

<Figure size 2880x1800 with 0 Axes>

<Figure size 2880x288 with 0 Axes>

LSTM-tiedT


<Figure size 2880x1800 with 0 Axes>

<Figure size 2880x288 with 0 Axes>

<Figure size 2880x1800 with 0 Axes>

<Figure size 2880x288 with 0 Axes>

<Figure size 2880x1800 with 0 Axes>

<Figure size 2880x288 with 0 Axes>

BERT-base


<Figure size 2880x1800 with 0 Axes>

<Figure size 2880x288 with 0 Axes>

<Figure size 2880x1800 with 0 Axes>

<Figure size 2880x288 with 0 Axes>

<Figure size 2880x1800 with 0 Axes>

<Figure size 2880x288 with 0 Axes>

BERT-large


<Figure size 2880x1800 with 0 Axes>

<Figure size 2880x288 with 0 Axes>

<Figure size 2880x1800 with 0 Axes>

<Figure size 2880x288 with 0 Axes>

<Figure size 2880x1800 with 0 Axes>

<Figure size 2880x288 with 0 Axes>