# Visualization

This notebook contains the code that visualizes the results of the experiments.

In [3]:
from collections import defaultdict
import evaluate
import json
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import os

In [None]:
# Display names for charts and tables, keyed by the model identifiers that
# are looked up from the result files.
model_labels = {
    'electra-base-discriminator': 'ELECTRA',
    'google': 'ELECTRA',
    'google/electra-base-discriminator': 'ELECTRA',
    'roberta-base': 'RoBERTa',
    'bert-base-uncased': 'BERT',
    'distilbert-base-uncased': 'DistilBERT',
}

# Column headers used for the metric tables
columns = ['Accuracy', 'Precision', 'Recall', 'F1']

### Visualization **manual evaluation**

In [None]:
# Dolly 2.0 annotations (ID is turned back into a regular column)
dolly = pd.read_csv("dolly/data/train.csv", index_col='ID').reset_index()

# MARGOT annotations (same treatment)
margot = pd.read_csv("margot/data/train.csv", index_col='ID').reset_index()

# Pair the two annotation sets on identical text. Columns that exist in both
# frames get a _dolly / _margot suffix; the Dolly ID becomes the index.
df_annotations = dolly.merge(margot, how='inner', on='text', suffixes=('_dolly', '_margot'))
df_annotations = df_annotations.set_index('ID_dolly')

# Restrict the frame to the columns used in the manual evaluation
kept_columns = [
    'text',
    'claim_margot', 'evidence_margot',
    'claim_dolly', 'evidence_dolly',
    'dataset_dolly', 'label_dolly',
]
df_annotations = df_annotations[kept_columns]
df_annotations.index.name = 'ID'

# Drop rows where MARGOT produced the <unk> placeholder: first for the claim,
# then (into a separate frame) for the evidence as well.
has_claim = df_annotations['claim_margot'] != '<unk>'
df_annotations = df_annotations.loc[has_claim]
has_evidence = df_annotations['evidence_margot'] != '<unk>'
final_df_annotations = df_annotations.loc[has_evidence]

In [None]:
# Pick 10 random texts (fixed seed, so the same rows are drawn on every run)
sample = final_df_annotations.sample(n=10, random_state=42)

# Write out the sampled argumentation-based extractions for manual evaluation
sample.to_excel('manual.xlsx', index_label='ID')

In [None]:
# Read the manually annotated extractions back in, indexed by ID, and show them
manual = pd.read_excel(io="manual_evaluation.xlsx", index_col='ID')
display(manual)

In [None]:
# Hand-counted alignments between the pipeline extraction and the manual
# extraction, turned into percentages (count / 10 * 100).
# NOTE(review): the Dolly counts add up to 19 while the MARGOT counts add up
# to 20 -- confirm the tallies.

# Results MARGOT
matrix_margot = np.array([[8, 1], [2, 9]]) / 10 * 100

# Results Dolly
matrix_dolly = np.array([[8, 2], [1, 8]]) / 10 * 100

In [None]:
# Plot confusion matrix: the cell colour is driven by the MARGOT percentages,
# and every cell is annotated with both the MARGOT and the Dolly 2.0 value.
labels = ['Claim', 'Evidence']

fig = plt.figure(figsize=(4, 4))
ax = sns.heatmap(matrix_margot, annot=False, cmap="Reds", cbar=False)

# Tick labels: x ticks along the top, y ticks on the left
ax.set_xticklabels(labels, fontsize=8)
ax.xaxis.tick_top()
ax.yaxis.tick_left()
ax.set_yticklabels(labels, fontsize=8)

# White text in the first and last diagonal cell, black everywhere else
last = len(labels) - 1
text_colors = [
    ['white' if (i, j) in ((0, 0), (last, last)) else 'black' for j in range(len(labels))]
    for i in range(len(labels))
]

# Cell centres sit at (column + 0.5, row + 0.5) in heatmap coordinates
for i, row_colors in enumerate(text_colors):
    for j, cell_color in enumerate(row_colors):
        lab = f'MARGOT: {matrix_margot[i, j]}%\nDolly 2.0: {matrix_dolly[i, j]}%'
        ax.text(j + 0.5, i + 0.5, lab, ha='center', va='center', color=cell_color, fontsize=7)

# Axis titles, with the x title moved to the top next to its ticks
ax.set_xlabel("Manual extraction", fontsize=10)
ax.set_ylabel("Argument component extraction", fontsize=10)
ax.xaxis.set_label_position('top')
ax.xaxis.labelpad = 7
ax.yaxis.labelpad = 7

fig.add_axes(ax)

fig.savefig('confusion.png')

### Visualization on **test** set

In [None]:
# Load baseline performance (JSON object: model -> metric -> score)
with open("baseline/performance.json") as json_file:
    baseline_perf = json.load(json_file)

# Convert JSON to a table: one row per model, scores shown with two decimals
idx = [model_labels[model] for model in baseline_perf]
values = []
for scores in baseline_perf.values():
    values.append(["{:.2f}".format(score) for score in scores.values()])
df_baseline = pd.DataFrame(values, index=idx, columns=columns)


In [None]:
# Load MARGOT: one sub-directory per component, each holding a json/ folder
# whose file maps model -> metric -> score.
performance_margot = {}
for component_name in sorted(os.listdir("margot/results")):
    path_to_json = f"margot/results/{component_name}/json"
    for file_name in sorted(os.listdir(path_to_json)):
        path = os.path.join(path_to_json, file_name)
        with open(path) as json_file:
            # NOTE(review): when a json/ folder holds several files, only the
            # last one (in sorted order) is kept -- confirm that one file per
            # component is expected.
            performance_margot[component_name] = json.load(json_file)

# Convert JSON to table.
# Every score is stored under its explicit (model, (component, metric)) key, so
# a row can only ever contain scores of its own model. Flattening the scores
# and reshaping them afterwards would mix models within a row as soon as a
# component reports more than one model.
idx, cols, cells = [], [], {}
for component, per_model in performance_margot.items():
    for model, scores in per_model.items():
        row_label = model_labels[model]
        if row_label not in idx:
            idx.append(row_label)
        for metric_name, score in scores.items():
            column_key = (component, metric_name)
            if column_key not in cols:
                cols.append(column_key)
            cells[(row_label, column_key)] = "{:.2f}".format(score)

# Create multi column index
multi_columns = pd.MultiIndex.from_tuples(cols, names=['component', 'metric'])

# One row per model; a (model, component, metric) combination that is missing
# from the results shows up as NaN.
values = [[cells.get((row_label, column_key), np.nan) for column_key in cols] for row_label in idx]

df_margot = pd.DataFrame(values, index=idx, columns=multi_columns)
display(df_margot)


In [None]:
# TODO: Replace by a function for MARGOT and Dolly 2.0
# Load Dolly 2.0: one sub-directory per component, each holding a json/ folder
# whose file maps model -> metric -> score.
performance_dolly = {}
for component_name in sorted(os.listdir("dolly/results")):
    path_to_json = f"dolly/results/{component_name}/json"
    for file_name in sorted(os.listdir(path_to_json)):
        path = os.path.join(path_to_json, file_name)
        with open(path) as json_file:
            # NOTE(review): when a json/ folder holds several files, only the
            # last one (in sorted order) is kept -- confirm that one file per
            # component is expected.
            performance_dolly[component_name] = json.load(json_file)

# Convert JSON to table.
# Every score is stored under its explicit (model, (component, metric)) key, so
# a row can only ever contain scores of its own model. Flattening the scores
# and reshaping them afterwards would mix models within a row as soon as a
# component reports more than one model.
idx, cols, cells = [], [], {}
for component, per_model in performance_dolly.items():
    for model, scores in per_model.items():
        row_label = model_labels[model]
        if row_label not in idx:
            idx.append(row_label)
        for metric_name, score in scores.items():
            column_key = (component, metric_name)
            if column_key not in cols:
                cols.append(column_key)
            cells[(row_label, column_key)] = "{:.2f}".format(score)

# Create multi column index
multi_columns = pd.MultiIndex.from_tuples(cols, names=['component', 'metric'])

# One row per model; a (model, component, metric) combination that is missing
# from the results shows up as NaN.
values = [[cells.get((row_label, column_key), np.nan) for column_key in cols] for row_label in idx]

df_dolly = pd.DataFrame(values, index=idx, columns=multi_columns)
display(df_dolly)


### Visualization performance with respect to text length

##### After preprocessing

In [None]:
# Use baseline and argumentation dataset from earlier.
# NOTE(review): the table cells further up in this notebook bind df_baseline and
# df_margot to metric tables; this cell needs the dataset frames that carry a
# 'text' column -- confirm those are loaded before running it.

# Replace missing texts by '' inside the frame. Assigning the filled column
# directly to the frame name would turn it into a Series and break the
# 'text' lookup on the next line.
df_baseline = df_baseline.assign(text=df_baseline['text'].fillna(''))
df_baseline['word_count'] = df_baseline['text'].str.split().apply(len)

# MARGOT and Dolly 2.0 have the same clean text, so they can be used interchangeably
df_arg = df_margot.assign(text=df_margot['text'].fillna(''))
df_arg['word_count'] = df_arg['text'].str.split().apply(len)

In [None]:
# Plot the word-count distribution of the baseline pipeline (top panel) and
# the argumentation-based pipeline (bottom panel)
sns.set_style("white")
sns.set_context('paper')

fig, ax = plt.subplots(2, 1, figsize=(5,6))
bins = 50

panels = [
    (ax[0], df_baseline, 'black', "Baseline pipeline"),
    (ax[1], df_arg, "#f37651", "Argumentation-based pipeline"),
]

# Both panels share the same binning, x range, ticks and axis titles
for axis, frame, bar_color, legend_text in panels:
    sns.histplot(data=frame, x='word_count', bins=bins, ax=axis, color=bar_color, label=legend_text)
    axis.set_xticks(np.arange(0, 2000, 200))
    axis.set_xlim([0, 2000])
    axis.set(xlabel='Number of words', ylabel='Frequency')
    axis.legend()

fig.tight_layout(pad=2.0)

plt.show()

##### Performance on test set

In [None]:
def get_labels(path_to_labels):
    """Load a labelled split and tag every text with a length bin.

    Reads ``<path_to_labels>.csv`` (columns ID, text, label; ID becomes the
    index) and drops rows that have a missing value.

    Added / converted columns:
        label:      "FAKE" -> 0, "REAL" -> 1 (any other value becomes NaN)
        word_count: number of whitespace-separated tokens in ``text``
        bin_id:     "Short" (1-100 words), "Medium" (101-300) or
                    "Long" (301-10000); counts outside that range get no bin

    Returns the resulting DataFrame.
    """
    label_codes = {"FAKE": 0, "REAL": 1}
    length_edges = [0, 100, 300, 10000]
    length_names = ["Short", "Medium", "Long"]

    frame = pd.read_csv(f"{path_to_labels}.csv", usecols=['ID', 'text', 'label'], index_col='ID')
    frame = frame.dropna()

    # String label -> integer label
    frame["label"] = frame["label"].map(label_codes)

    # Text length in words, then the bin that length falls into (right-closed)
    frame['word_count'] = frame['text'].str.split().apply(len)
    frame['bin_id'] = pd.cut(frame['word_count'], bins=length_edges, labels=length_names, right=True)
    return frame

In [None]:
metric = evaluate.combine(["f1"])

def process_predictions(df_labels, path_to_best_model_predictions):
    """Compute the F1 score of one model separately for every text-length bin.

    Args:
        df_labels: frame with at least the columns ``label`` and ``bin_id``,
            indexed by ID (the shape returned by ``get_labels``).
        path_to_best_model_predictions: CSV file with the columns ``ID`` and
            ``prediction``.

    Returns:
        A list with one F1 score per bin, in the order in which
        ``groupby('bin_id')`` yields the bins. The bin name, its size and its
        F1 score are also printed.
    """
    # Load predictions for best model
    df_predictions = pd.read_csv(path_to_best_model_predictions, usecols=['ID', 'prediction'], index_col='ID')

    # Combine predictions and correct labels (rows are aligned on the ID index)
    df = pd.concat([df_labels, df_predictions], axis=1)

    # Calculate F1 score per bin
    f1_per_batch = []
    for bin_name, data in df.groupby('bin_id'):
        # Pass by keyword: positionally, the first argument of compute() is
        # `predictions`, so compute(labels, preds) would swap the two roles.
        result = metric.compute(
            predictions=data['prediction'].values,
            references=data['label'].values,
        )
        print(bin_name, len(data), result['f1'])
        f1_per_batch.append(result['f1'])

    return f1_per_batch

In [None]:
def plot_lines(performance, ax):
    """Draw one line per entry of ``performance`` on ``ax`` and return ``ax``.

    ``performance`` maps a legend label to a sequence of y values. All lines
    share the x positions 1..N, where N is the length of the first sequence.
    """
    names = list(performance)
    # Indexing the first name raises IndexError for an empty mapping
    first_series = performance[names[0]]
    x_values = np.arange(1, len(first_series) + 1)

    for name in names:
        ax.plot(x_values, performance[name], label=name)

    return ax

In [None]:
# Per-bin F1 of the baseline: labels come from the baseline test split,
# predictions from the RoBERTa predictions file
baseline_path_to_labels = "baseline/data/test"
base_path_best_model_preds = "baseline/results/roberta-base_predictions.csv"
performance_baseline = process_predictions(
    df_labels=get_labels(path_to_labels=baseline_path_to_labels),
    path_to_best_model_predictions=base_path_best_model_preds,
)

In [None]:
def per_component(path_to_labels, best_models):
    """Compute the per-length-bin F1 scores for every argumentation component.

    Args:
        path_to_labels: path (without the ``.csv`` suffix) passed on to
            ``get_labels``.
        best_models: mapping from component name ('structure', 'claim',
            'evidence') to the predictions CSV passed on to
            ``process_predictions``.

    Returns:
        dict mapping each component name to the list of F1 scores returned by
        ``process_predictions``.

    Raises:
        TypeError: if ``best_models`` is a plain string instead of a mapping.
    """
    # A plain string cannot be indexed by component name. Reject it up front
    # with a readable message instead of failing on "string indices" after the
    # labels have already been loaded.
    if isinstance(best_models, str):
        raise TypeError(
            "best_models must map each component ('structure', 'claim', 'evidence') "
            f"to a predictions file, got the string {best_models!r}"
        )

    df_labels = get_labels(path_to_labels)

    return {
        comp: process_predictions(df_labels, best_models[comp])
        for comp in ['structure', 'claim', 'evidence']
    }

In [None]:
# Per-bin F1 for the MARGOT and Dolly 2.0 pipelines.
# MARGOT and Dolly 2.0 have the same clean text, so they can be used interchangeably.
# NOTE(review): path_to_labels is not passed to the calls below -- confirm it is still needed.
path_to_labels = "models/argumentation-based/argumentation structure"

# NOTE(review): per_component indexes its second argument by component name
# ('structure', 'claim', 'evidence'), so the plain string passed here raises a
# TypeError; a mapping from component to predictions file is probably intended.
# NOTE(review): the Dolly path starts with "/" (absolute) while the MARGOT path
# is relative -- confirm which one is meant.
performance_margot = per_component(f"margot/models/test", "distilbert-base-uncased_predictions")
performance_dolly = per_component(f"/dolly/models/test", "distilbert-base-uncased_predictions")

In [None]:
# Rebuild both result dicts in the order used for the graph
# (Evidence, Claim, Structure), keyed by the capitalised label
display_order = (('Evidence', 'evidence'), ('Claim', 'claim'), ('Structure', 'structure'))
performance_margot = {shown: performance_margot[key] for shown, key in display_order}
performance_dolly = {shown: performance_dolly[key] for shown, key in display_order}

In [None]:
# BASELINE + MARGOT
sns.set_style("white")
sns.set_context('paper')

plt.figure(figsize=(6, 4))

# Baseline as a dashed black line, one point per text-length bin
ax = sns.lineplot(
    performance_baseline,
    marker='o',
    linestyle='--',
    color="black",
    markersize=5,
    label='Baseline',
)
ax.set_xticks(range(len(performance_baseline)))
ax.set_xticklabels(['Short', 'Medium', 'Long'])
ax.set(xlabel='Text length', ylabel='F1', title="MARGOT")

palette = sns.color_palette("Set2")

# One coloured line per component, drawn on the same axes
for position, (label, data) in enumerate(performance_margot.items()):
    print(label)
    sns.lineplot(
        x=[0, 1, 2],
        y=data,
        marker='D',
        label=label.capitalize(),
        markersize=5,
        ax=ax,
        color=palette[position],
    )

plt.show()

In [None]:
# BASELINE + Dolly 2.0 # TODO: Write function for both
sns.set_style("white")
sns.set_context('paper')

plt.figure(figsize=(6, 4))

# Baseline as a dashed black line, one point per text-length bin
ax = sns.lineplot(performance_baseline, marker='o', linestyle='--', color="black", markersize=5, label='Baseline')
ax.set_xticks(range(len(performance_baseline)))
ax.set_xticklabels(['Short', 'Medium', 'Long'])
ax.set(xlabel='Text length', ylabel='F1')
# This figure shows the Dolly 2.0 results, so it is titled accordingly
ax.set(title="Dolly 2.0")

palette = sns.color_palette("Set2")

# One coloured line per component, taken from the Dolly 2.0 results
# (not from the MARGOT results plotted in the previous figure)
for i, (label, data) in enumerate(performance_dolly.items()):
    print(label)
    sns.lineplot(x=[0, 1, 2], y=data, marker='D', label=label.capitalize(), markersize=5, ax=ax, color=palette[i])

plt.show()