In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from snorkel.labeling import LFAnalysis
import numpy as np
from matplotlib.ticker import FuncFormatter

In [None]:
def get_pairwise_correlation(df_signals):
    """Return the matrix of pairwise correlations between the columns of ``df_signals``.

    For each ordered pair of columns, cells equal to ``-1`` (abstentions) are
    masked to NaN before ``DataFrame.corr()`` is evaluated on the two-column
    frame, so each coefficient only uses the values that survive the mask.

    The result is a square DataFrame whose index and columns are
    ``df_signals.columns``.
    """
    names = df_signals.columns
    size = len(names)

    def pair_coefficient(first, second):
        # Two-column view of the pair; -1 cells become NaN and are skipped by corr()
        pair = df_signals.loc[:, [first, second]]
        masked = pair.where(pair != -1)
        return masked.corr().to_numpy()[0, 1]

    matrix = np.zeros((size, size))
    for row, first in enumerate(names):
        matrix[row, :] = [pair_coefficient(first, second) for second in names]

    return pd.DataFrame(matrix, columns=names, index=names)

def _draw_vote_bars(ax, df, signal_columns, bar_width, opacity):
    """Draw one No/Yes/Unsure bar group per signal on ``ax``.

    Each bar height is the percentage of rows in ``df`` whose value in that
    column equals 0 ('No'), 1 ('Yes') or -1 ('Unsure').
    """
    vote_shares = {col: df[col].value_counts(normalize=True) for col in signal_columns}
    # (vote value, legend label, colour, horizontal offset from the group centre)
    vote_styles = (
        (0, 'No', "green", -bar_width),
        (1, 'Yes', "red", 0),
        (-1, 'Unsure', "grey", bar_width),
    )
    for vote, label, color, offset in vote_styles:
        positions = [center + offset for center in range(len(signal_columns))]
        heights = [vote_shares[col].get(vote, 0) * 100 for col in signal_columns]
        ax.bar(positions, heights, width=bar_width, label=label, align='center', color=color, alpha=opacity)


def plot_and_save_distribution(df_pos, df_neg, dataset_name):
    """Plot the per-signal vote distribution for both classes and save it as a PDF.

    Parameters
    ----------
    df_pos : pandas.DataFrame
        Signal votes (0, 1 or -1) drawn in the lower panel, titled
        "Misinformation Articles". Its columns define the signals that are plotted.
    df_neg : pandas.DataFrame
        Signal votes drawn in the upper panel, titled "Non-Misinformation Articles".
        It must contain the columns of ``df_pos``.
    dataset_name : str
        Prefix of the output file ``outputs/{dataset_name}_signal_distributions.pdf``.

    The ``outputs`` directory is created when missing.
    """
    import os  # local import so the function does not rely on another cell

    # Derive the signal list from the data instead of a hard-coded count, so the
    # tick labels always match the number of bar groups.
    credibility_signals = list(df_pos.columns)
    group_centers = list(range(len(credibility_signals)))
    bar_width = 0.2
    opacity = 0.7
    n_votes = 3  # No / Yes / Unsure

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 14), sharex=True)
    _draw_vote_bars(ax1, df_neg, credibility_signals, bar_width, opacity)
    _draw_vote_bars(ax2, df_pos, credibility_signals, bar_width, opacity)

    ax1.set_title("Non-Misinformation Articles", fontsize=25)
    ax2.set_title("Misinformation Articles", fontsize=25)
    ax2.set_xlabel('Credibility Signal', fontsize=20)

    def percent_formatter(value, _tick_index):
        return f"{int(value)}%"

    for ax in (ax1, ax2):
        # Ticks sit on the group centre, i.e. under the middle ('Yes') bar
        ax.set_xticks(group_centers)
        ax.set_xticklabels(credibility_signals, rotation=45, ha='right', fontsize=20)
        ax.tick_params(axis='y', labelsize=20)
        ax.yaxis.set_major_formatter(FuncFormatter(percent_formatter))

    # Both panels use identical vote styles, so the handles of one panel are
    # enough for the shared legend (one handle per label).
    handles, labels = ax1.get_legend_handles_labels()
    legend = ax1.legend(handles, labels, title='Vote', fontsize=15, ncol=n_votes, bbox_to_anchor=(0.5, 1.35), loc='upper center')
    legend.get_title().set_fontsize(20)

    # Adjust the spacing between subplots
    fig.tight_layout()
    os.makedirs("outputs", exist_ok=True)
    fig.savefig(f"outputs/{dataset_name}_signal_distributions.pdf", format="pdf", bbox_inches="tight")

In [None]:
datasets = ["gossipcop", "fakenewsdataset", "politifact", "celebritydataset"]
avg_correlations = []  # one pairwise correlation matrix (signals + ground truth) per dataset
avg_analysis = []      # one LFAnalysis summary table per dataset
avg_corr_wrt_gt = []   # one array of signal-vs-ground-truth correlations per dataset
all_dfs = []           # the cleaned frame of every dataset, in `datasets` order

# The second column of signals.csv is used as the list of signal column names.
signals = pd.read_csv("../data/signals.csv").iloc[:,1].tolist()

# Per signal: one percentage per dataset of 0 / 1 / -1 votes, split by ground truth
# ("neg" = Ground Truth 0, "pos" = Ground Truth 1).
neg_percentages_0 = {k: [] for k in signals}
neg_percentages_1 = {k: [] for k in signals}
neg_percentages_minus_1 = {k: [] for k in signals}

pos_percentages_0 = {k: [] for k in signals}
pos_percentages_1 = {k: [] for k in signals}
pos_percentages_minus_1 = {k: [] for k in signals}

for dataset in datasets:
    df_path = f"../data/processed/{dataset}/llama2_platypus/70/{dataset}.csv"
    df = pd.read_csv(df_path)
    df = df.drop("objective_pred", axis=1)
    df = df.rename({"objective_true": "Ground Truth"}, axis=1)

    # The first 19 columns are treated as signals, the 20th as the ground truth.
    df_signals = df.iloc[:, :19]
    y_true = df["Ground Truth"].to_numpy()
    correlation_df = get_pairwise_correlation(df.iloc[:, :20]) # Signals + ground truth

    lf_analysis_df = LFAnalysis(df_signals.to_numpy()).lf_summary(y_true).set_index(df_signals.columns)
    lf_analysis_df["Corr. wrt. GT"] = correlation_df["Ground Truth"]
    avg_corr_wrt_gt.append(lf_analysis_df["Corr. wrt. GT"].to_numpy())
    avg_correlations.append(correlation_df)
    # Store this dataset's summary (the list was previously appended to itself).
    avg_analysis.append(lf_analysis_df)
    all_dfs.append(df)

    df_neg = df[df["Ground Truth"] == 0].iloc[:, :19]
    df_pos = df[df["Ground Truth"] == 1].iloc[:, :19]

    plot_and_save_distribution(df_neg=df_neg, df_pos=df_pos, dataset_name=dataset)

    for col in signals:
        counts = df_neg[col].value_counts(normalize=True)
        neg_percentages_0[col].append(counts.get(0, 0) * 100)
        neg_percentages_1[col].append(counts.get(1, 0) * 100)
        neg_percentages_minus_1[col].append(counts.get(-1, 0) * 100)

        counts = df_pos[col].value_counts(normalize=True)
        pos_percentages_0[col].append(counts.get(0, 0) * 100)
        pos_percentages_1[col].append(counts.get(1, 0) * 100)
        pos_percentages_minus_1[col].append(counts.get(-1, 0) * 100)

# Collapse each per-dataset list into its mean and (population) standard deviation.
for percentages in (
    neg_percentages_0,
    neg_percentages_1,
    neg_percentages_minus_1,
    pos_percentages_0,
    pos_percentages_1,
    pos_percentages_minus_1,
):
    for col, values in percentages.items():
        percentages[col] = {"mean": np.mean(values), "std": np.std(values)}

In [None]:
# Averaged vote distribution per signal (mean over datasets, std as error bars).
# The global `df` is deliberately left untouched: other cells still need the full frame.

# Label the bar groups with the very keys the aggregated percentages are stored
# under, so labels and bars cannot get out of order.
credibility_signals = list(neg_percentages_0.keys())
n_signals = len(credibility_signals)
group_centers = list(range(n_signals))
opacity = 0.7
width = 0.2  # Width of each bar

# (legend label, colour, horizontal offset from the group centre)
vote_styles = [
    ('No', "green", -width),
    ('Yes', "red", 0),
    ('Unsure', "grey", width),
]


def percent_formatter(x, _pos):
    return f"{int(x)}%"


fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 14), sharex=True)

# Each panel pairs an axis with its title and the 0 / 1 / -1 statistics, in vote_styles order
panels = [
    (ax1, "Non-Misinformation Articles", (neg_percentages_0, neg_percentages_1, neg_percentages_minus_1)),
    (ax2, "Misinformation Articles", (pos_percentages_0, pos_percentages_1, pos_percentages_minus_1)),
]

for ax, title, per_vote_stats in panels:
    for (label, color, offset), stats in zip(vote_styles, per_vote_stats):
        means = [stats[signal]["mean"] for signal in credibility_signals]
        stds = [stats[signal]["std"] for signal in credibility_signals]
        positions = [center + offset for center in group_centers]
        ax.bar(positions, means, width=width, label=label, align='center', color=color, alpha=opacity)
        ax.errorbar(positions, means, yerr=stds, fmt='none', ecolor='black')

    # Ticks sit on the group centre, i.e. under the middle ('Yes') bar
    ax.set_xticks(group_centers)
    ax.set_xticklabels(credibility_signals, rotation=45, ha='right', fontsize=20)
    ax.set_title(title, fontsize=25)
    ax.tick_params(axis='y', labelsize=20)
    ax.yaxis.set_major_formatter(FuncFormatter(percent_formatter))

ax2.set_xlabel('Credibility Signal', fontsize=20)

# Both panels share the same vote styles, so one panel's handles are enough
# for the single legend (one handle per label).
handles, labels = ax1.get_legend_handles_labels()
legend = ax1.legend(handles, labels, title='Vote', fontsize=15, ncol=len(vote_styles), bbox_to_anchor=(0.5, 1.35), loc='upper center')
legend.get_title().set_fontsize(20)

# Adjust the spacing between subplots
fig.tight_layout()
fig.savefig("outputs/avg_signal_distributions.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Correlation of every signal with the ground truth: one row per dataset plus their average.
correlations_df = pd.DataFrame(data=avg_corr_wrt_gt, index=datasets, columns=df_signals.columns)
correlations_df = correlations_df.rename(
    {
        "celebritydataset": "Celebrity",
        "fakenewsdataset": "FakeNewsAMT",
        "gossipcop": "GossipCop",
        "politifact": "PolitiFact"
    }, axis=0)
correlations_df.loc["Average"] = correlations_df.mean(axis=0)

correlation_df = correlations_df.sort_values("Average", axis=1, ascending=False) # sort by avg.
correlation_df = correlation_df.transpose() # make it a vertical plot
correlation_df = correlation_df[["Average", "PolitiFact", "FakeNewsAMT", "Celebrity", "GossipCop"]]

# Create the figure only once the table is ready, and draw on it explicitly.
# No colour bar is drawn, so no colour-bar options are passed.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_df, square=True, cmap="Reds", annot=True, fmt=".2f", cbar=False, ax=ax)
fig.tight_layout()
fig.savefig("outputs/correlations_per_dataset.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Pairwise Pearson correlation between all signals + veracity,
# averaged element-wise over the per-dataset matrices.
mean_corr_values = np.mean(avg_correlations, axis=0)
label_source = avg_correlations[0]  # every matrix is labelled like the first one
mean_pairwise_corr = pd.DataFrame(
    mean_corr_values,
    index=label_source.index,
    columns=label_source.columns,
)
sns.heatmap(mean_pairwise_corr)

In [None]:
# Read the source frame from `all_dfs` instead of the global `df`: other cells
# rebind `df`, so depending on it makes this cell fail or change meaning with
# the execution order.
# NOTE(review): this uses the last loaded dataset (what `df` held right after the
# loading loop); switch to pd.concat(all_dfs) if pooled numbers are wanted — confirm.
source_df = all_dfs[-1]
ground_truth = source_df["Ground Truth"]
df_neg = source_df[ground_truth == 0].iloc[:, :19]
df_pos = source_df[ground_truth == 1].iloc[:, :19]

# Caution: these names replace the cross-dataset mean/std dictionaries with
# single-element lists computed from `source_df` only.
signals = df_pos.columns
neg_percentages_0 = {k: [] for k in signals}
neg_percentages_1 = {k: [] for k in signals}
neg_percentages_minus_1 = {k: [] for k in signals}

pos_percentages_0 = {k: [] for k in signals}
pos_percentages_1 = {k: [] for k in signals}
pos_percentages_minus_1 = {k: [] for k in signals}

for col in signals:
    # Share (in percent) of 0 / 1 / -1 votes among ground-truth-0 rows
    counts = df_neg[col].value_counts(normalize=True)
    neg_percentages_0[col].append(counts.get(0, 0) * 100)
    neg_percentages_1[col].append(counts.get(1, 0) * 100)
    neg_percentages_minus_1[col].append(counts.get(-1, 0) * 100)

    # Same shares among ground-truth-1 rows
    counts = df_pos[col].value_counts(normalize=True)
    pos_percentages_0[col].append(counts.get(0, 0) * 100)
    pos_percentages_1[col].append(counts.get(1, 0) * 100)
    pos_percentages_minus_1[col].append(counts.get(-1, 0) * 100)

In [None]:
# Display the per-signal share of 0-votes among ground-truth-0 rows, as last computed
neg_percentages_0

In [None]:
# Stack every per-dataset frame row-wise into one pooled frame; row labels are kept as-is
df = pd.concat(all_dfs, axis=0, ignore_index=False)

In [None]:
# `percentages_0_neg` is never defined in this notebook; the 0-vote percentages
# for ground-truth-0 rows are stored under `neg_percentages_0`.
neg_percentages_0