In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from snorkel.labeling.model import LabelModel
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import random
import numpy as np
from matplotlib.ticker import FuncFormatter
from snorkel.labeling import LFAnalysis
import seaborn as sns

# Ablation Study

In [None]:
# Seed passed to the StratifiedKFold splitters (random_state) and to LabelModel.fit (seed) in this notebook.
SEED = 42

In [None]:
def get_train_dev_test_fold(fold, dataset, model_size, model_name="llama2_platypus", num_splits=10):
    """Return the (train, test) frames of one fold of a stratified k-fold split.

    The CSV at ``../data/processed/{dataset}/{model_name}/{model_size}/{dataset}.csv``
    is read and split with a shuffled ``StratifiedKFold`` seeded with ``SEED``,
    stratified on the ``objective_true`` column. Despite the function name, no
    separate dev split is produced.

    Args:
        fold: Zero-based index of the fold to return (``0 <= fold < num_splits``).
        dataset: Dataset name; used for both the directory and the file name.
        model_size: Path component selecting the model-size sub-directory.
        model_name: Path component selecting the model sub-directory.
        num_splits: Number of folds of the cross-validation.

    Returns:
        Tuple ``(train_df, test_df)`` of row subsets of the loaded frame.

    Raises:
        AssertionError: If ``fold`` is outside ``[0, num_splits)``.
    """
    # A negative fold used to pass the check, walk through every split and
    # silently return None; reject it up front instead.
    assert 0 <= fold < num_splits

    dataset_path = f"../data/processed/{dataset}/{model_name}/{model_size}/{dataset}.csv"
    df = pd.read_csv(dataset_path)
    skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=SEED)
    labels = df["objective_true"].to_numpy()
    for j, (train_idxs, test_idxs) in enumerate(skf.split(range(len(df)), y=labels)):
        train_df, test_df = df.iloc[train_idxs], df.iloc[test_idxs]
        # Fraction of rows that ended up in the train / test part of this fold.
        print(len(train_df)/len(df), len(test_df)/len(df))

        if fold == j:
            return train_df, test_df

In [None]:
def predict_majority(row):
    """Return the majority vote of ``row``, ignoring abstentions (``-1``).

    Args:
        row: One-dimensional collection of votes (e.g. a DataFrame row as a
            ``pd.Series``). ``-1`` marks an abstention.

    Returns:
        The most frequent non-abstain value. If several values share the
        highest count, one of them is drawn at random. If every vote is an
        abstention (or ``row`` is empty), ``0`` or ``1`` is drawn at random.
    """
    votes = pd.Series(row)
    counts = votes[votes != -1].value_counts()

    # Nothing but abstentions: fall back to a coin flip between the two classes.
    if counts.empty:
        return np.random.choice([0, 1])

    winners = counts[counts == counts.max()].index
    if len(winners) == 1:
        # Clear majority (this also covers a single non-abstain vote); the
        # random generator is deliberately not touched in this case.
        return winners[0]

    # Tie between classes: pick one of the tied classes at random.
    return np.random.choice(winners.to_numpy())

In [None]:
# Signal names in the order given by the variable name (sorted by correlation).
# NOTE(review): presumably correlation with the ground truth; the list is
# reversed right below, so signals are added starting from the END of this listing.
signals_sorted_by_corr = ['Document Citation', 'Sensationalism', 'Misleading about content',
       'Evidence', 'Expert Citation', 'Emotional Valence',
       'Reported by Other Sources', 'Clickbait', 'Source Credibility', 'Bias',
       'Explicitly Unverified Claims', 'Polarizing Language', 'Informal Tone',
       'Incorrect Spelling', 'Incivility', 'Personal Perspective', 'Inference',
       'Impoliteness', 'Call to Action']

signals_sorted_by_corr.reverse()

# The label model is always given at least this many columns; smaller signal
# sets are padded with all-abstain (-1) columns.
MIN_LABEL_MODEL_SIGNALS = 3

# One record per (dataset, fold, #signals).
# NOTE(review): `all` shadows the builtin of the same name; the name is kept
# because the aggregation cell further down reads it.
all = []
best_signals_per_dataset = {}
for dataset in ["politifact", "fakenewsamt", "celebrity", "gossipcop"]:
    print(dataset)

    df = pd.read_csv(f"../data/signals/{dataset}.csv")

    # cross validation loop
    sf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
    for fold, (train_index, test_index) in enumerate(sf.split(df, df["objective_true"])):
        df_train, df_test = df.iloc[train_index], df.iloc[test_index]

        # Seed Python's RNG with the shared seed. A bare random.seed() reseeds
        # from OS entropy and would make runs non-repeatable.
        random.seed(SEED)
        y_test_gold = df_test["objective_true"].to_numpy()

        # Grow the signal set one signal at a time.
        for i in range(1, len(signals_sorted_by_corr) + 1):
            selected_signals = signals_sorted_by_corr[:i]
            L_ws_train = df_train.loc[:, selected_signals].to_numpy()
            L_ws_test = df_test.loc[:, selected_signals].to_numpy()

            label_model = LabelModel(cardinality=2, device="cpu", verbose=False)
            n_padding = MIN_LABEL_MODEL_SIGNALS - i
            if n_padding > 0:
                # snorkel does not allow less than 3 signals, so append
                # (3 - i) columns of abstentions, keeping the matrix dtype.
                L_ws_train = np.concatenate(
                    [L_ws_train, np.full((len(L_ws_train), n_padding), -1, dtype=L_ws_train.dtype)], axis=1)
                L_ws_test = np.concatenate(
                    [L_ws_test, np.full((len(L_ws_test), n_padding), -1, dtype=L_ws_test.dtype)], axis=1)

            label_model.fit(L_ws_train, n_epochs=500, seed=SEED, progress_bar=False)
            y_pred_ws = label_model.predict(L=L_ws_test, tie_break_policy="random")
            val_f1_macro = f1_score(y_test_gold, y_pred_ws, average='macro', zero_division=0)

            all.append({"dataset": dataset, "fold": fold, "f1": val_f1_macro, "#signals": i})

In [None]:
# Mean and standard deviation of the F1 scores across folds,
# for every (dataset, number of signals) combination.
df = pd.DataFrame(all)
scores_per_setting = df.groupby(["dataset", "#signals"])
df_grouped = scores_per_setting.mean().reset_index()
# Both aggregations share the same group order, so the positional indexes line up.
f1_spread = scores_per_setting.std().reset_index()
df_grouped["std"] = f1_spread["f1"]


In [None]:
fig, ax = plt.subplots(figsize=(9, 5))

# Replace the dataset identifiers by their display names for the legend.
df_grouped["dataset"] = df_grouped["dataset"].replace({
    "politifact": "PolitiFact",
    "gossipcop": "GossipCop",
    "fakenewsamt": "FakeNewsAMT",
    "celebrity": "Celebrity",
})

fontsize = 20
# One line per dataset: mean macro F1 against the number of signals used.
for dataset in df_grouped["dataset"].unique():
    subset = df_grouped[df_grouped["dataset"] == dataset]
    ax.plot(subset["#signals"], subset["f1"], label=dataset, linewidth=2)

ax.set_xlabel('# Signals', fontsize=fontsize)
ax.set_ylabel("F1 Macro", fontsize=fontsize)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# The x axis starts at 3 signals. Use each signal count once as a tick; the
# raw column repeats every count once per dataset and fold.
min_signals = 3
max_signals = max(df["#signals"])
x_ticks = sorted({min_signals, *(n for n in df["#signals"].unique() if n >= min_signals)})
ax.set_xticks(x_ticks)
ax.set_yticks(np.arange(0.30, 1.1, 0.1))
# Limits are set AFTER the ticks: setting ticks expands the view limits to
# include every tick, which would otherwise undo the limits chosen here.
ax.set_xlim(min_signals, max_signals)
ax.set_ylim(0.30, 1.0)

ax.tick_params(axis='y', labelsize=fontsize)
ax.tick_params(axis='x', labelsize=fontsize)
ax.legend(fontsize=fontsize-5)
ax.grid(True)

plt.tight_layout()
# plt.savefig(f"signal_ablation_sorted_corr_all.pdf", format="pdf", bbox_inches="tight")
plt.show()


# Signal Distributions and Correlations

In [None]:
def get_pairwise_correlation(df_signals):
    """Correlate every pair of columns, ignoring abstentions (``-1``).

    Args:
        df_signals: DataFrame with one column per signal, where ``-1``
            marks an abstention.

    Returns:
        Square DataFrame indexed and labelled by ``df_signals.columns`` that
        holds the correlation coefficient (``DataFrame.corr`` defaults) of
        each column pair, computed only on rows where neither column abstains.
    """
    # Turn abstentions into NaN. DataFrame.corr excludes NaN pair by pair,
    # so one call replaces building a two-column frame for every pair.
    votes = df_signals.where(df_signals != -1)
    return votes.corr()

def _plot_vote_shares(ax, df, opacity, width=0.2):
    """Draw grouped No / Yes / Unsure percentage bars for every column of ``df`` on ``ax``."""
    # (vote value, legend label, bar colour, horizontal offset within the group)
    vote_styles = (
        (0, 'No', "green", -width),
        (1, 'Yes', "red", 0),
        (-1, 'Unsure', "grey", width),
    )
    shares = {vote: [] for vote, _, _, _ in vote_styles}
    for col in df.columns:
        counts = df[col].value_counts(normalize=True)
        for vote in shares:
            # Values that never occur in the column count as 0 %.
            shares[vote].append(counts.get(vote, 0) * 100)

    positions = range(len(df.columns))
    for vote, label, color, offset in vote_styles:
        ax.bar([p + offset for p in positions], shares[vote], width=width,
               label=label, align='center', color=color, alpha=opacity)


def plot_and_save_distribution(df_pos, df_neg, dataset_name):
    """Show the per-signal vote distribution as two stacked bar charts.

    The upper panel is drawn from ``df_neg`` (titled "Non-Misinformation
    Articles"), the lower one from ``df_pos`` ("Misinformation Articles").
    For each column the percentage of ``0`` ("No"), ``1`` ("Yes") and
    ``-1`` ("Unsure") values is plotted.

    Note: despite the name, the figure is only shown, not written to disk.

    Args:
        df_pos: Signal votes plotted in the lower panel.
        df_neg: Signal votes plotted in the upper panel.
        dataset_name: Currently unused.
    """
    objectives = ['ABSTAIN', 'Misinformation', 'Not Misinformation']
    # At most the first 19 columns are labelled; deriving the count from the
    # data keeps ticks and labels consistent for narrower frames.
    credibility_signals = df_pos.columns[:19]
    n_signals = len(credibility_signals)
    n_objectives = len(objectives)
    bar_width = 0.8 / n_objectives
    opacity = 0.7

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 14), sharex=True)

    # Each panel uses its own frame for both the columns and the counts.
    _plot_vote_shares(ax1, df_neg, opacity)
    _plot_vote_shares(ax2, df_pos, opacity)

    # Same tick positions and labels on both panels.
    tick_positions = [p + (n_objectives - 1) * bar_width / 2 for p in range(n_signals)]
    for ax in (ax1, ax2):
        ax.set_xticks(tick_positions)
        ax.set_xticklabels(credibility_signals, rotation=45, ha='right', fontsize=20)
        ax.tick_params(axis='y', labelsize=20)

    ax1.set_title("Non-Misinformation Articles", fontsize=25)
    ax2.set_xlabel('Credibility Signal', fontsize=20)
    ax2.set_title("Misinformation Articles", fontsize=25)

    # Single legend for both subplots; both panels use the same three entries,
    # so the handles of the first one are sufficient.
    handles, labels = ax1.get_legend_handles_labels()
    legend = ax1.legend(handles, labels, title='Vote', fontsize=15, ncol=len(objectives),
                        bbox_to_anchor=(0.5, 1.35), loc='upper center')
    legend.get_title().set_fontsize(20)

    def percent_formatter(x, pos):
        return f"{int(x)}%"

    ax1.yaxis.set_major_formatter(FuncFormatter(percent_formatter))
    ax2.yaxis.set_major_formatter(FuncFormatter(percent_formatter))

    # Adjust the spacing between subplots
    plt.tight_layout()
    plt.show()

In [None]:
datasets = ["gossipcop", "fakenewsamt", "politifact", "celebrity"]
avg_correlations = []   # pairwise correlation frame per dataset
avg_analysis = []       # snorkel LF summary frame per dataset
avg_corr_wrt_gt = []    # correlation of every signal with the ground truth, per dataset
all_dfs = []

signals = pd.read_csv("../data/signals.csv").iloc[:,1].tolist()
# Per signal: percentage of 0 / 1 / -1 votes, one entry per dataset.
neg_percentages_0 = {k: [] for k in signals}
neg_percentages_1 = {k: [] for k in signals}
neg_percentages_minus_1 = {k: [] for k in signals}

pos_percentages_0 = {k: [] for k in signals}
pos_percentages_1 = {k: [] for k in signals}
pos_percentages_minus_1 = {k: [] for k in signals}

# Vote value -> accumulator, so the bookkeeping below is written only once.
neg_by_vote = {0: neg_percentages_0, 1: neg_percentages_1, -1: neg_percentages_minus_1}
pos_by_vote = {0: pos_percentages_0, 1: pos_percentages_1, -1: pos_percentages_minus_1}

for dataset in datasets:
    df_path = f"../data/signals/{dataset}.csv"
    df = pd.read_csv(df_path)
    df = df.drop("objective_pred", axis=1)
    df = df.rename({"objective_true": "Ground Truth"}, axis=1)

    # NOTE(review): assumes the first 19 columns are the signals and the 20th
    # is the ground truth - confirm against the CSV layout.
    df_signals = df.iloc[:, :19]
    y_true = df["Ground Truth"].to_numpy()
    correlation_df = get_pairwise_correlation(df.iloc[:, :20]) # Signals + ground truth

    lf_analysis_df = LFAnalysis(df_signals.to_numpy()).lf_summary(y_true).set_index(df_signals.columns)
    lf_analysis_df["Corr. wrt. GT"] = correlation_df["Ground Truth"]
    avg_corr_wrt_gt.append(lf_analysis_df["Corr. wrt. GT"].to_numpy())
    avg_correlations.append(correlation_df)
    # Store this dataset's summary (the list used to be appended to itself).
    avg_analysis.append(lf_analysis_df)
    all_dfs.append(df)

    df_neg = df[df["Ground Truth"] == 0].iloc[:, :19]
    df_pos = df[df["Ground Truth"] == 1].iloc[:, :19]

    plot_and_save_distribution(df_neg=df_neg, df_pos=df_pos, dataset_name=dataset)

    # Record the vote shares of this dataset for every signal and class.
    for col in signals:
        for frame, by_vote in ((df_neg, neg_by_vote), (df_pos, pos_by_vote)):
            counts = frame[col].value_counts(normalize=True)
            for vote, per_signal in by_vote.items():
                per_signal[col].append(counts.get(vote, 0) * 100)

# Collapse the per-dataset lists into mean / standard deviation across datasets.
for by_vote in (neg_by_vote, pos_by_vote):
    for per_signal in by_vote.values():
        for col in signals:
            per_signal[col] = {"mean": np.mean(per_signal[col]), "std": np.std(per_signal[col])}

In [None]:
objectives = ['ABSTAIN', 'Misinformation', 'Not Misinformation']
# Label the bars with the keys the aggregated percentages are stored under,
# so tick labels and bar heights are guaranteed to be in the same order.
credibility_signals = list(neg_percentages_0)
n_signals = len(credibility_signals)
n_objectives = len(objectives)
bar_width = 0.8 / n_objectives
opacity = 0.7
pos = range(n_signals)
width = 0.2  # Width of each bar

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 14), sharex=True)

# (horizontal offset within a group, legend label, bar colour), in the order 0 / 1 / -1
vote_styles = [(-width, 'No', "green"), (0, 'Yes', "red"), (width, 'Unsure', "grey")]
# Upper panel: averages of the ground-truth-0 articles; lower panel: ground-truth-1 articles.
panels = [
    (ax1, (neg_percentages_0, neg_percentages_1, neg_percentages_minus_1)),
    (ax2, (pos_percentages_0, pos_percentages_1, pos_percentages_minus_1)),
]
for ax, stats_per_vote in panels:
    for (offset, label, color), stats in zip(vote_styles, stats_per_vote):
        xs = [p + offset for p in pos]
        means = [stats[name]["mean"] for name in credibility_signals]
        stds = [stats[name]["std"] for name in credibility_signals]
        ax.bar(xs, means, width=width, label=label, align='center', color=color, alpha=opacity)
        # Error bars show the standard deviation across datasets.
        ax.errorbar(xs, means, yerr=stds, fmt='none', ecolor='black')

# Same tick positions and labels on both panels.
tick_positions = [p + (n_objectives - 1) * bar_width / 2 for p in pos]
for ax in (ax1, ax2):
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(credibility_signals, rotation=45, ha='right', fontsize=20)
    ax.tick_params(axis='y', labelsize=20)

ax1.set_title("Non-Misinformation Articles", fontsize=25)
ax2.set_xlabel('Credibility Signal', fontsize=20)
ax2.set_title("Misinformation Articles", fontsize=25)

# Single legend for both subplots; the three entries are identical on both
# panels, so the handles of the first one are sufficient.
handles, labels = ax1.get_legend_handles_labels()
legend = ax1.legend(handles, labels, title='Vote', fontsize=15, ncol=len(objectives),
                    bbox_to_anchor=(0.5, 1.35), loc='upper center')
legend.get_title().set_fontsize(20)

def percent_formatter(x, pos):
    return f"{int(x)}%"

ax1.yaxis.set_major_formatter(FuncFormatter(percent_formatter))
ax2.yaxis.set_major_formatter(FuncFormatter(percent_formatter))

# Adjust the spacing between subplots
plt.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))

# One row per dataset, one column per signal: correlation of the signal with the ground truth.
correlations_df = pd.DataFrame(data=avg_corr_wrt_gt, index=datasets, columns=df_signals.columns)
correlations_df = correlations_df.rename(
    index={
        "celebrity": "Celebrity",
        "fakenewsamt": "FakeNewsAMT",
        "gossipcop": "GossipCop",
        "politifact": "PolitiFact",
    })
correlations_df.loc["Average"] = correlations_df.mean(axis=0)

correlation_df = (
    correlations_df
    .sort_values("Average", axis=1, ascending=False)  # sort signals by avg.
    .transpose()                                      # make it a vertical plot
    .loc[:, ["Average", "PolitiFact", "FakeNewsAMT", "Celebrity", "GossipCop"]]
)

# No colorbar is drawn, so no colorbar keyword arguments are passed.
sns.heatmap(correlation_df, square=True, cmap="Reds", annot=True, fmt=".2f", cbar=False, ax=ax)
plt.tight_layout()
# plt.savefig("outputs/correlations_per_dataset.pdf", format="pdf", bbox_inches="tight")
plt.show()