In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from snorkel.labeling.model import LabelModel
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import random
import numpy as np
from matplotlib.ticker import FuncFormatter
from snorkel.labeling import LFAnalysis
import seaborn as sns
from scipy.stats import chisquare

# Signal Distributions and Correlations

In [2]:
def get_pairwise_correlation(df_signals):
    """Return the pairwise correlation matrix of the columns of ``df_signals``.

    A value of ``-1`` is treated as an abstention: it is masked out, so each
    coefficient is computed only from the rows in which neither of the two
    columns involved abstained.

    Parameters
    ----------
    df_signals : pandas.DataFrame
        Numeric frame with one column per signal (the caller also appends the
        ground-truth column).

    Returns
    -------
    pandas.DataFrame
        Square frame whose index and columns are ``df_signals.columns``.
    """
    # Turning -1 into NaN lets DataFrame.corr() exclude abstentions pair by
    # pair in a single call, instead of building a two-column frame and
    # calling .corr() once per ordered pair of columns.
    votes = df_signals.mask(df_signals == -1)
    return votes.corr()

def _vote_percentages(df):
    """Return three lists (votes 0, 1, -1) holding each column's vote share in percent."""
    shares = {0: [], 1: [], -1: []}
    for col in df.columns:
        counts = df[col].value_counts(normalize=True)
        for vote, bucket in shares.items():
            bucket.append(counts.get(vote, 0) * 100)
    return shares[0], shares[1], shares[-1]


def _draw_vote_bars(ax, df, width, opacity):
    """Draw a No / Yes / Unsure bar triplet on ``ax`` for every column of ``df``."""
    pct_no, pct_yes, pct_unsure = _vote_percentages(df)
    centers = list(range(len(df.columns)))
    ax.bar([c - width for c in centers], pct_no, width=width, label='No', align='center', color="green", alpha=opacity)
    ax.bar(centers, pct_yes, width=width, label='Yes', align='center', color="red", alpha=opacity)
    ax.bar([c + width for c in centers], pct_unsure, width=width, label='Unsure', align='center', color="grey", alpha=opacity)


def plot_and_save_distribution(df_pos, df_neg, dataset_name):
    """Show the per-signal vote distribution for the two classes of one dataset.

    The figure has two stacked panels sharing the x axis: the upper one is
    built from ``df_neg`` and titled "Non-Misinformation Articles", the lower
    one from ``df_pos`` and titled "Misinformation Articles". For every column
    three bars give the percentage of rows voting 0 ("No"), 1 ("Yes") and
    -1 ("Unsure").

    Parameters
    ----------
    df_pos : pandas.DataFrame
        Signal votes drawn in the lower panel; its column names label the x axis.
    df_neg : pandas.DataFrame
        Signal votes drawn in the upper panel; expected to have the same
        columns as ``df_pos``.
    dataset_name : str
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. Despite the function
        name, nothing is written to disk.
    """
    objectives = ['ABSTAIN', 'Misinformation', 'Not Misinformation']
    credibility_signals = df_pos.columns
    n_signals = len(credibility_signals)  # derived from the data instead of a hard-coded 19
    n_objectives = len(objectives)
    bar_width = 0.8 / n_objectives
    opacity = 0.7
    width = 0.2  # Width of each bar

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 14), sharex=True)

    _draw_vote_bars(ax1, df_neg, width, opacity)
    _draw_vote_bars(ax2, df_pos, width, opacity)

    # Tick offset carried over unchanged from the original layout so the
    # rendered figure keeps its appearance.
    tick_positions = [p + (n_objectives - 1) * bar_width / 2 for p in range(n_signals)]

    ax1.set_xticks(tick_positions)
    ax1.set_xticklabels(credibility_signals, rotation=45, ha='right', fontsize=20)
    ax1.set_title("Non-Misinformation Articles", fontsize=25)

    ax2.set_xticks(tick_positions)
    ax2.set_xticklabels(credibility_signals, rotation=45, ha='right', fontsize=20)
    ax2.set_xlabel('Credibility Signal', fontsize=20)
    ax2.set_title("Misinformation Articles", fontsize=25)

    ax1.tick_params(axis='y', labelsize=20)
    ax2.tick_params(axis='y', labelsize=20)

    # Both panels use the same three entries, so one legend built from the
    # upper panel's handles is enough (handle and label counts now match).
    handles, labels = ax1.get_legend_handles_labels()
    legend = ax1.legend(handles, labels, title='Vote', fontsize=15, ncol=len(objectives), bbox_to_anchor=(0.5, 1.35), loc='upper center')
    legend.get_title().set_fontsize(20)

    def percent_formatter(x, pos):
        return f"{int(x)}%"

    ax1.yaxis.set_major_formatter(FuncFormatter(percent_formatter))
    ax2.yaxis.set_major_formatter(FuncFormatter(percent_formatter))

    # Adjust the spacing between subplots
    plt.tight_layout()
    plt.show()

In [None]:
# Load every dataset, compute per-dataset signal statistics, plot the vote
# distribution per class, and accumulate the per-signal vote percentages that
# the following cells average across datasets.
datasets = ["gossipcop", "fakenewsamt", "politifact", "celebrity"]
avg_correlations = []
avg_analysis = []
avg_corr_wrt_gt = []
all_dfs = []
all_neg = []
all_pos = []

signals = pd.read_csv("../data/signals.csv").iloc[:,1].tolist()

# One list per signal; each dataset appends its percentage of 0 / 1 / -1 votes.
neg_percentages_0 = {k: [] for k in signals}
neg_percentages_1 = {k: [] for k in signals}
neg_percentages_minus_1 = {k: [] for k in signals}

pos_percentages_0 = {k: [] for k in signals}
pos_percentages_1 = {k: [] for k in signals}
pos_percentages_minus_1 = {k: [] for k in signals}

for dataset in datasets:
    df_path = f"../data/signals/{dataset}.csv"
    df = pd.read_csv(df_path)
    df = df.drop("objective_pred", axis=1)
    df = df.rename({"objective_true": "Ground Truth"}, axis=1)

    # The first 19 columns are taken as the signals; the first 20 as signals + ground truth.
    df_signals = df.iloc[:, :19]
    y_true = df["Ground Truth"].to_numpy()
    correlation_df = get_pairwise_correlation(df.iloc[:, :20]) # Signals + ground truth

    lf_analysis_df = LFAnalysis(df_signals.to_numpy()).lf_summary(y_true).set_index(df_signals.columns)
    lf_analysis_df["Corr. wrt. GT"] = correlation_df["Ground Truth"]
    avg_corr_wrt_gt.append(lf_analysis_df["Corr. wrt. GT"].to_numpy())
    avg_correlations.append(correlation_df)
    # Store this dataset's summary (previously the list was appended to itself).
    avg_analysis.append(lf_analysis_df)
    all_dfs.append(df)

    df_neg = df[df["Ground Truth"] == 0].iloc[:, :19]
    df_pos = df[df["Ground Truth"] == 1].iloc[:, :19]

    all_neg.append(df_neg)
    all_pos.append(df_pos)
    plot_and_save_distribution(df_neg=df_neg, df_pos=df_pos, dataset_name=dataset)

    for col in signals:
        for frame, buckets in (
            (df_neg, (neg_percentages_0, neg_percentages_1, neg_percentages_minus_1)),
            (df_pos, (pos_percentages_0, pos_percentages_1, pos_percentages_minus_1)),
        ):
            counts = frame[col].value_counts(normalize=True)
            for vote, bucket in zip((0, 1, -1), buckets):
                bucket[col].append(counts.get(vote, 0) * 100)

# Collapse each per-signal list into its mean and standard deviation across datasets.
for percentages in (
    neg_percentages_0, neg_percentages_1, neg_percentages_minus_1,
    pos_percentages_0, pos_percentages_1, pos_percentages_minus_1,
):
    for col in signals:
        values = percentages[col]
        percentages[col] = {"mean": np.mean(values), "std": np.std(values)}

In [None]:
# Figure: per-signal vote distribution averaged over all datasets, drawn as
# mean bars with standard-deviation error bars (upper panel: non-misinformation,
# lower panel: misinformation).
# Reads the neg_/pos_percentages_* dicts, which by now hold {"mean", "std"} per
# signal, plus df / df_pos / df_neg left over from the dataset loop.
# `objectives` is only used for its length (tick offset and legend columns).
objectives = ['ABSTAIN', 'Misinformation', 'Not Misinformation']
# NOTE(review): labels come from df's column order while the bar heights below
# follow the key order of the percentage dicts — presumably identical; confirm.
credibility_signals = df.iloc[:, :19].columns
n_signals = 19
n_objectives = len(objectives)
bar_width = 0.8 / n_objectives
opacity = 0.7
pos = range(n_signals)

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 14), sharex=True)
# Rebinds the global `df`; only its column count is used for the bar positions.
df = df_pos

# Bar positions: three bars per signal, centred on the signal's integer index
width = 0.2  # Width of each bar
x = range(len(df.columns))
x_0 = [i - width for i in x]
x_1 = x
x_minus_1 = [i + width for i in x]

# The neg_means_* lists are read again by the stacked-bar cell that follows.
neg_means_0 = [v["mean"] for v in neg_percentages_0.values()]
neg_means_1 = [v["mean"] for v in neg_percentages_1.values()]
neg_means_minus_1 = [v["mean"] for v in neg_percentages_minus_1.values()]
neg_stds_0 = [v["std"] for v in neg_percentages_0.values()]
neg_stds_1 = [v["std"] for v in neg_percentages_1.values()]
neg_stds_minus_1 = [v["std"] for v in neg_percentages_minus_1.values()]

ax1.bar(x_0, neg_means_0, width=width, label='No', align='center', color="green", alpha=opacity)
ax1.errorbar(x_0, neg_means_0, yerr=neg_stds_0, fmt='none', ecolor='black')

ax1.bar(x_1, neg_means_1, width=width, label='Yes', align='center', color="red", alpha=opacity)
ax1.errorbar(x_1, neg_means_1, yerr=neg_stds_1, fmt='none', ecolor='black')

ax1.bar(x_minus_1, neg_means_minus_1, width=width, label='Unsure', align='center', color="grey", alpha=opacity)
ax1.errorbar(x_minus_1, neg_means_minus_1, yerr=neg_stds_minus_1, fmt='none', ecolor='black')

# `df` is left bound to df_neg after this cell; again only the column count matters here.
df = df_neg

# Bar positions for the lower panel (same layout as above)
width = 0.2  # Width of each bar
x = range(len(df.columns))
x_0 = [i - width for i in x]
x_1 = x
x_minus_1 = [i + width for i in x]

# The pos_means_* lists are read again by the stacked-bar cell that follows.
pos_means_0 = [v["mean"] for v in pos_percentages_0.values()]
pos_means_1 = [v["mean"] for v in pos_percentages_1.values()]
pos_means_minus_1 = [v["mean"] for v in pos_percentages_minus_1.values()]
pos_stds_0 = [v["std"] for v in pos_percentages_0.values()]
pos_stds_1 = [v["std"] for v in pos_percentages_1.values()]
pos_stds_minus_1 = [v["std"] for v in pos_percentages_minus_1.values()]


ax2.bar(x_0, pos_means_0, width=width, label='No', align='center', color="green", alpha=opacity)
ax2.errorbar(x_0, pos_means_0, yerr=pos_stds_0, fmt='none', ecolor='black')

ax2.bar(x_1, pos_means_1, width=width, label='Yes', align='center', color="red", alpha=opacity)
ax2.errorbar(x_1, pos_means_1, yerr=pos_stds_1, fmt='none', ecolor='black')

ax2.bar(x_minus_1, pos_means_minus_1, width=width, label='Unsure', align='center', color="grey", alpha=opacity)
ax2.errorbar(x_minus_1, pos_means_minus_1, yerr=pos_stds_minus_1, fmt='none', ecolor='black')

# Set the x-axis labels (ticks sit (n_objectives - 1) * bar_width / 2 to the right of each index)
ax1.set_xticks([p + (n_objectives - 1) * bar_width / 2 for p in pos])
ax1.set_xticklabels(credibility_signals, rotation=45, ha='right', fontsize=20)
# ax1.set_ylabel('Average Percentage', fontsize=20)
ax1.set_title("Non-Misinformation Articles", fontsize=25)

# Set the x-axis labels
ax2.set_xticks([p + (n_objectives - 1) * bar_width / 2 for p in pos])
ax2.set_xticklabels(credibility_signals, rotation=45, ha='right', fontsize=20)
ax2.set_xlabel('Credibility Signal', fontsize=20)
# ax2.set_ylabel('Mean', fontsize=20)
ax2.set_title("Misinformation Articles", fontsize=25)

ax1.tick_params(axis='y', labelsize=20)
ax2.tick_params(axis='y', labelsize=20)

# Create a single legend for both subplots
# NOTE(review): `handles` concatenates both panels' handles while `labels` only
# holds the upper panel's labels, so the two sequences differ in length.
handles1, labels1 = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()
handles = handles1 + handles2
labels = labels1
legend = ax1.legend(handles, labels, title='Vote', fontsize=15, ncol=len(objectives), bbox_to_anchor=(0.5, 1.35), loc='upper center')
legend.get_title().set_fontsize(20)

# Tick formatter: renders a y value as a whole-number percentage, e.g. 40 -> "40%"
def percent_formatter(x, pos):
    return f"{int(x)}%"

ax1.yaxis.set_major_formatter(FuncFormatter(percent_formatter))
ax2.yaxis.set_major_formatter(FuncFormatter(percent_formatter))

# Adjust the spacing between subplots
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import FuncFormatter
import matplotlib.patches as mpatches


# Stacked-bar summary of the averaged vote distribution: for every signal one
# plain bar (non-misinformation) next to one hatched bar (misinformation), each
# stacked as No / Yes / Unsure. Relies on the pos_means_* / neg_means_* lists
# computed by the preceding plotting cell.
# NOTE(review): this figure colours 'No' red and 'Yes' green, the opposite of
# the grouped-bar figures above — confirm that this is intended.

# Define objectives and data
objectives = ['No', 'Yes', 'Unsure']
credibility_signals = df.iloc[:, :19].columns
# Change to Polarizing Language to British English
credibility_signals = [signal.replace("Polarizing", "Polarising") for signal in credibility_signals]

n_signals = len(credibility_signals)

# Combine data into a single structure
misinformation_means = [pos_means_0, pos_means_1, pos_means_minus_1]
non_misinformation_means = [neg_means_0, neg_means_1, neg_means_minus_1]

# Plot parameters
bar_width = 0.4
index = np.arange(n_signals)

# Create the plot
fig, ax = plt.subplots(figsize=(20, 10))

# Plotting the "Non-Misinformation" stacked bars without hatching
ax.bar(index - bar_width/2, non_misinformation_means[0], bar_width, label='Non-Misinformation - No', alpha=0.7, color='red')
ax.bar(index - bar_width/2, non_misinformation_means[1], bar_width, label='Non-Misinformation - Yes', alpha=0.7, color='green', bottom=non_misinformation_means[0])
ax.bar(index - bar_width/2, non_misinformation_means[2], bar_width, label='Non-Misinformation - Unsure', alpha=0.7, color='grey', bottom=np.add(non_misinformation_means[0], non_misinformation_means[1]))

# Plotting the "Misinformation" stacked bars with a '//' hatch
ax.bar(index + bar_width/2, misinformation_means[0], bar_width, label='Misinformation - No', alpha=0.7, color='red', hatch='//')
ax.bar(index + bar_width/2, misinformation_means[1], bar_width, label='Misinformation - Yes', alpha=0.7, color='green', hatch='//', bottom=misinformation_means[0])
ax.bar(index + bar_width/2, misinformation_means[2], bar_width, label='Misinformation - Unsure', alpha=0.7, color='grey', hatch='//', bottom=np.add(misinformation_means[0], misinformation_means[1]))

# Set x-ticks and labels
ax.set_xticks(index)
ax.set_xticklabels(credibility_signals, rotation=45, ha='right', fontsize=20)

# Set titles and labels
# ax.set_title("Comparison of Credibility Signals", fontsize=25, pad=50)  # Add padding to title
ax.set_xlabel('Credibility Signal', fontsize=20)
ax.set_ylabel('Average Percentage', fontsize=20)
ax.tick_params(axis='y', labelsize=20)

# Adjust layout to make space for the legend below the title
# plt.subplots_adjust(top=1.8)

# Simplified legend: three colour swatches plus plain vs. hatched class swatches.
# The class swatch uses the same '//' hatch as the bars so the legend matches
# what is drawn (it previously used a backslash hatch).
handles = [
    mpatches.Patch( facecolor="red", label='No', edgecolor='black'),
    mpatches.Patch( facecolor="green", label='Yes', edgecolor='black'),
    mpatches.Patch( facecolor="grey", label='Unsure', edgecolor='black'),
    mpatches.Patch( facecolor="white", label='Non-Misinformation', edgecolor='black'),
    mpatches.Patch( facecolor="white", hatch='//', label='Misinformation', edgecolor='black'),
]
legend = ax.legend(handles=handles, title='Legend', fontsize=15, ncol=2, bbox_to_anchor=(0.5, 1.2), loc='upper center')
legend.get_title().set_fontsize(20)

# Format y-axis as percentages
def percent_formatter(x, pos):
    return f"{int(x)}%"

ax.yaxis.set_major_formatter(FuncFormatter(percent_formatter))

# Save the figure
plt.savefig("../outputs/avg_signal_distributions.pdf", format="pdf", bbox_inches="tight")

plt.show()

In [6]:
def chisquare_test(df_neg, df_pos):
    """Run a one-way chi-square test per signal on its "Yes" counts in the two classes.

    For every column, the number of ``1`` votes in ``df_neg`` and in ``df_pos``
    form a two-element observed vector that is passed to
    ``scipy.stats.chisquare`` with its default expected frequencies.

    Parameters
    ----------
    df_neg, df_pos : pandas.DataFrame
        Signal votes of the two classes; both must have the same columns in
        the same order. ``-1`` (abstain) is replaced by ``0`` before summing.

    Returns
    -------
    list of tuple
        One ``(signal, chi2, p)`` tuple per column, in column order. The
        signal name is taken from ``df_neg.columns`` so that it always labels
        the column that was actually counted.
    """
    # NOTE(review): the observed counts are not normalised by class size, so an
    # imbalanced dataset alone can produce a large statistic — confirm intent.

    # Replace -1 with 0 and sum along axis 0 to get counts for each signal
    neg_count = df_neg.replace(-1, 0).to_numpy().sum(axis=0)
    pos_count = df_pos.replace(-1, 0).to_numpy().sum(axis=0)

    results = []
    for signal, observed_neg, observed_pos in zip(df_neg.columns, neg_count, pos_count):
        observed = np.array([observed_neg, observed_pos])
        chi2, p = chisquare(f_obs=observed)
        results.append((signal, chi2, p))

    return results

In [None]:
# Build one long table with the chi-square outcome of every (dataset, signal) pair.
per_dataset_results = []
for dataset, neg, pos in zip(datasets, all_neg, all_pos):
    raw_rows = chisquare_test(neg, pos)
    print(f"Dataset: {dataset}")
    result = pd.DataFrame(raw_rows, columns=["Signal", "Chi2", "p-value"])

    # significance flag is derived from the numeric p-value, before it becomes a string
    result["p < 0.05"] = result["p-value"] < 0.05

    # p-values are kept as text in scientific notation with two decimals
    result["p-value"] = result["p-value"].map("{:.2e}".format)

    # round the statistic to two decimals, then scale it so the largest value
    # within this dataset becomes 1
    rounded_chi2 = result["Chi2"].round(2)
    result["Chi2"] = rounded_chi2 / rounded_chi2.max()

    result["dataset"] = dataset

    per_dataset_results.append(result)

results = pd.concat(per_dataset_results)
results

In [None]:
# Heatmap of the normalised chi-square statistic per (dataset, signal), with
# extra rows for the Politics / Entertainment / All averages and a '*' on
# cells whose p-value is below 0.05. Relabels `results` in place first
# (British spelling, display names for the datasets).
results.loc[results["Signal"] == "Polarizing Language", "Signal"] = "Polarising Language"  # Change to British English
results.loc[results["dataset"] == "gossipcop", "dataset"] = "GossipCop"
results.loc[results["dataset"] == "fakenewsamt", "dataset"] = "FakeNewsAMT"
results.loc[results["dataset"] == "politifact", "dataset"] = "PolitiFact"
results.loc[results["dataset"] == "celebrity", "dataset"] = "Celebrity"

# Create the heatmap DataFrame (rows: datasets, columns: signals)
chi2_heatmap = results.pivot(index="dataset", columns="Signal", values="Chi2")

# Compute the average rows (all four datasets, and the two domain pairs)
average_row = chi2_heatmap.mean()
politics_avg = chi2_heatmap.loc[["FakeNewsAMT", "PolitiFact"], :].mean()
entertainment_avg = chi2_heatmap.loc[["Celebrity", "GossipCop"], :].mean()

# Add a blank row before the average rows; it acts as a visual separator
chi2_heatmap.loc[" "] = np.nan  # Add a blank row

chi2_heatmap.loc["Politics", :] = politics_avg
chi2_heatmap.loc["Entertainment", :] = entertainment_avg
chi2_heatmap.loc["All", :] = average_row

# Extract the significance flags; an aggregate row is flagged only if every
# dataset it covers is flagged
pvalues = results.pivot(index="dataset", columns="Signal", values="p < 0.05")
pvalues.loc[" "] = False  # Add a blank row
pvalues.loc["Politics", :] = pvalues.loc[["FakeNewsAMT", "PolitiFact"]].all()
pvalues.loc["Entertainment", :] = pvalues.loc[["Celebrity", "GossipCop"]].all()
pvalues.loc["All", :] = pvalues.loc[["Celebrity", "GossipCop", "FakeNewsAMT", "PolitiFact"]].all()

# Order the signals by the 'All' row, highest value first
sorted_indices = chi2_heatmap.loc["All"].sort_values(ascending=False).index

# Reorder the DataFrame based on the sorted indices
chi2_heatmap = chi2_heatmap[sorted_indices]  # Sort columns
chi2_heatmap = chi2_heatmap.loc[chi2_heatmap.index]  # Ensure row order is maintained
pvalues = pvalues[sorted_indices]  # Align pvalues DataFrame with sorted chi2_heatmap

# Plot the heatmap
# NOTE(review): cbar_kws is passed although cbar=False — presumably it has no effect; confirm.
plt.figure(figsize=(10, 8))
ax = sns.heatmap(chi2_heatmap, square=True, cmap="Reds", annot=True, fmt=".2f", cbar=False, cbar_kws={"size": 5}, annot_kws={"va": "top"})

# Add stars for significant values (drawn above the number, which is anchored with va="top")
for i in range(pvalues.shape[0]):
    for j in range(pvalues.shape[1]):
        if pvalues.iloc[i, j]:  # If the p-value is significant
            ax.text(j + 0.5, i + 0.5, '*', ha='center', va='bottom', color='black', fontsize=10, fontweight='bold')

# Adjust y-ticks to include the blank row
ax.set_yticks(np.arange(len(chi2_heatmap.index)) + 0.5)
ax.set_yticklabels(chi2_heatmap.index, rotation=0)

# Get the y-ticks and hide the one corresponding to the blank row
# (fourth from the bottom: blank, Politics, Entertainment, All)
yticks = ax.yaxis.get_major_ticks()
yticks[-4].set_visible(False)  # Hides the tick corresponding to the blank row

# Rotate the x-ticks
plt.xticks(rotation=45, ha='right')

# Remove both axis labels
plt.xlabel(None)
plt.ylabel(None)

# Adjust the y-axis to clearly show the blank space
# plt.yticks(rotation=0)
plt.tight_layout()

# plt.savefig("outputs/chi2.pdf", format="pdf", bbox_inches="tight")

plt.show()

## Ablation Study

In [None]:
# Heatmap column order, with the British spelling mapped back to the
# "Polarizing Language" spelling.
signals_sorted_by_chi2 = [
    name.replace("Polarising Language", "Polarizing Language")
    for name in chi2_heatmap.columns
]
signals_sorted_by_chi2

In [None]:
SEED = 42
def get_train_dev_test_fold(fold, dataset, model_size, model_name="llama2_platypus", num_splits=10):
    """Return the ``(train_df, test_df)`` pair of one stratified CV fold.

    Reads ``../data/signals/{dataset}.csv``, splits it with a shuffled
    ``StratifiedKFold`` seeded by ``SEED`` and stratified on ``objective_true``,
    and returns the ``fold``-th split. The train/test size ratios of every
    split visited on the way are printed. ``model_size`` and ``model_name``
    are accepted but not used by this function.
    """
    assert fold < num_splits

    df = pd.read_csv(f"../data/signals/{dataset}.csv")
    total = len(df)
    labels = df["objective_true"].to_numpy()
    splitter = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=SEED)

    fold_no = 0
    for train_idxs, test_idxs in splitter.split(range(total), y=labels):
        train_df = df.iloc[train_idxs]
        test_df = df.iloc[test_idxs]
        print(len(train_df)/total, len(test_df)/total)

        if fold_no == fold:
            return train_df, test_df
        fold_no += 1


def predict_majority(row):
    """Return the majority class among the votes in ``row``.

    ``-1`` votes are abstentions and are ignored. If no vote remains, one of
    ``0`` / ``1`` is drawn at random. If several classes share the highest
    count, one of the tied classes is drawn at random (the tie used to be
    resolved deterministically, contrary to what the comment promised).

    Parameters
    ----------
    row : pandas.Series or sequence
        Votes of all signals for one sample.

    Returns
    -------
    int
        The winning class label.
    """
    votes = pd.Series(row)
    # Counting on the values avoids label-based ``row[0]`` lookups, which fail
    # for rows indexed by signal name.
    counts = votes[votes != -1].value_counts()

    if counts.empty:
        return np.random.choice([0, 1])

    winners = counts[counts == counts.max()].index.tolist()
    if len(winners) == 1:
        return winners[0]
    return np.random.choice(winners)



# Ablation: for every dataset and CV fold, fit a LabelModel on the first i
# signals of the ordering below, for i = 1 .. number of signals, and record
# the macro F1 on the held-out fold.

# Heatmap column order reversed, with the spelling mapped back to "Polarizing Language".
signals_sorted_by_chi2 = [
    signal.replace("Polarising Language", "Polarizing Language")
    for signal in chi2_heatmap.columns
]
signals_sorted_by_chi2.reverse()
print(signals_sorted_by_chi2)

# Named `fold_scores` rather than `all` so the builtin all() is not shadowed.
fold_scores = []
for dataset in ["politifact", "fakenewsamt", "celebrity", "gossipcop"]:
    df_dataset = pd.read_csv(f"../data/signals/{dataset}.csv")

    # cross validation loop
    sf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
    for fold, (train_index, test_index) in enumerate(sf.split(df_dataset, df_dataset["objective_true"])):
        df_train, df_test = df_dataset.iloc[train_index], df_dataset.iloc[test_index]

        # Seed explicitly; a bare random.seed() reseeds from system entropy.
        random.seed(SEED)
        y_test_gold = df_test["objective_true"].to_numpy()

        for i in range(1, len(signals_sorted_by_chi2) + 1):
            selected_signals = signals_sorted_by_chi2[:i]
            L_ws_train = df_train.loc[:, selected_signals].to_numpy()
            L_ws_test = df_test.loc[:, selected_signals].to_numpy()

            label_model = LabelModel(cardinality=2, device="cpu", verbose=False)
            if i < 3:  # snorkel does not allow less than 3 signals, so pad with abstain-only columns up to 3
                L_ws_train = np.concatenate([L_ws_train, np.zeros((len(L_ws_train), 3-i))-1], axis=1)
                L_ws_test = np.concatenate([L_ws_test, np.zeros((len(L_ws_test), 3-i))-1], axis=1)

            label_model.fit(L_ws_train, n_epochs=500, seed=SEED, progress_bar=False)
            y_pred_ws = label_model.predict(L=L_ws_test, tie_break_policy="random")
            val_f1_macro = f1_score(y_test_gold, y_pred_ws, average='macro', zero_division=0)

            fold_scores.append({"dataset": dataset, "fold": fold, "f1": val_f1_macro, "#signals": i})

# One row per (dataset, fold, #signals); the mean and std are computed by the plotting cells.
df = pd.DataFrame(fold_scores)

In [None]:
# First version of the ablation plot: mean macro F1 per dataset against the
# number of signals used, on an inverted x axis.
# NOTE(review): the in-place reverse below makes this cell non-idempotent —
# running it twice flips the label order back.
# NOTE(review): this cell writes "signal_ablation.pdf", the same file name the
# later ablation plot cell saves to, so its output is presumably overwritten.
signals_sorted_by_chi2.reverse()
df_grouped = df.groupby(["dataset", "#signals"]).mean().reset_index()
df_grouped["std"] = df.groupby(["dataset", "#signals"]).std().reset_index()["f1"]
fig, ax = plt.subplots(figsize=(9, 5))

# Display names for the datasets
df_grouped.loc[df_grouped["dataset"] == "politifact", "dataset"] = "PolitiFact"
df_grouped.loc[df_grouped["dataset"] == "gossipcop", "dataset"] = "GossipCop"
df_grouped.loc[df_grouped["dataset"] == "fakenewsamt", "dataset"] = "FakeNewsAMT"
df_grouped.loc[df_grouped["dataset"] == "celebrity", "dataset"] = "Celebrity"

df_grouped = df_grouped[["dataset", "#signals", "f1", "std"]]

# Get unique datasets
unique_datasets = list(df_grouped["dataset"].unique())
fontsize=15
# Plot the mean F1 of each dataset (the "std" column is computed above but not drawn)
for dataset in unique_datasets:
    subset = df_grouped[df_grouped["dataset"] == dataset]
    # ax.errorbar(subset["#signals"], subset["mu"], yerr=subset["std_err"], label=dataset)
    ax.plot(subset["#signals"], subset["f1"], label=dataset, linewidth=2)

# ax.set_xlabel('Signals', fontsize=fontsize)
ax.set_ylabel("F1 Macro", fontsize=fontsize)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# ax.set_xlim(3, max(df["#signals"]))
# Left limit larger than right limit: the x axis runs from the most to the fewest signals
ax.set_xlim(max(df["#signals"]), 1)
# ax.set_xticks([3] + list(df["#signals"]))
ax.tick_params(axis='y', labelsize=fontsize)
ax.tick_params(axis='x', labelsize=10)
ax.set_ylim(0.0, 1.0)
plt.yticks(np.arange(0.0, 1.0, 0.1))


# add the signal names to the x axis
ax.set_xticks(range(1, 20))


# fix polarizing language to polarising language
signals_sorted_by_chi2 = [signal.replace("Polarizing Language", "Polarising Language") for signal in signals_sorted_by_chi2]

# Tick k is labelled "(k) - <k-th entry of the list>"
# NOTE(review): confirm that the list order at this point matches the signal
# subsets the scores at each tick were computed with.
ax.set_xticklabels([f"({i+1}) - {v}" for i, v in enumerate(signals_sorted_by_chi2)], rotation=45, ha='right', fontsize=fontsize-5)



# plt.xticks(rotation=90, ha='right')
legend = ax.legend(fontsize=fontsize-5)
legend.get_title().set_fontsize(fontsize) 
ax.grid(True)

plt.tight_layout()
plt.savefig(f"signal_ablation.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [12]:
# Heatmap column order reversed, followed by a blank placeholder entry.
signals_sorted_by_chi2 = list(chi2_heatmap.columns)[::-1] + [" "]
# signals_sorted_by_chi2 = [signal for signal in signals_sorted_by_chi2 if signal != "Inference"]
# signals_sorted_by_chi2.insert(0, " ")

In [None]:
# Ablation plot: mean macro F1 per dataset, plus the mean across datasets,
# against the number of signals used, on an inverted x axis.
# Expects signals_sorted_by_chi2 to hold 20 entries (19 signals plus the blank
# placeholder appended by the preceding cell) to label ticks 0..19.
df_grouped = df.groupby(["dataset", "#signals"]).mean().reset_index()
df_grouped["std"] = df.groupby(["dataset", "#signals"]).std().reset_index()["f1"]
fig, ax = plt.subplots(figsize=(9, 5))

# Display names for the datasets
df_grouped.loc[df_grouped["dataset"] == "politifact", "dataset"] = "PolitiFact"
df_grouped.loc[df_grouped["dataset"] == "gossipcop", "dataset"] = "GossipCop"
df_grouped.loc[df_grouped["dataset"] == "fakenewsamt", "dataset"] = "FakeNewsAMT"
df_grouped.loc[df_grouped["dataset"] == "celebrity", "dataset"] = "Celebrity"

df_grouped = df_grouped[["dataset", "#signals", "f1", "std"]]

# Get unique datasets
unique_datasets = list(df_grouped["dataset"].unique())
fontsize = 15

# Plot the mean F1 of each dataset (the "std" column is computed above but not drawn)
for dataset in unique_datasets:
    subset = df_grouped[df_grouped["dataset"] == dataset]
    ax.plot(subset["#signals"], subset["f1"], label=dataset, linewidth=2, alpha=0.8)

# Unweighted mean of the per-dataset means, drawn as a dashed black line
average_f1 = df_grouped.groupby("#signals")["f1"].mean()
ax.plot(average_f1.index, average_f1.values, label='Mean', linewidth=2, linestyle='--', color='black', alpha=1)

# Annotate the average line values
# for i, (x, y) in enumerate(zip(average_f1.index, average_f1.values)):
#     ax.text(x-0.3, y+0.03, f"{y:.3f}".lstrip('0'), color="black", fontsize=fontsize-3, ha='center', va='bottom')

ax.set_ylabel("F1 Macro", fontsize=fontsize)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Reverse the X-axis (left limit larger than right limit)
ax.set_xlim(max(df["#signals"]), 1)

# Set X-ticks and labels
ax.set_xticks(range(0, 20))

# Use the British spelling in the tick labels; tick i is labelled "(i) - <i-th entry of the list>"
signals_sorted_by_chi2 = [signal.replace("Polarizing Language", "Polarising Language") for signal in signals_sorted_by_chi2]
ax.set_xticklabels([f"({i}) - {v}" for i, v in enumerate(signals_sorted_by_chi2)], rotation=45, ha='right', fontsize=fontsize-5)

ax.tick_params(axis='y', labelsize=fontsize)
ax.tick_params(axis='x', labelsize=10)
ax.set_ylim(0.3, 1.0)
plt.yticks(np.arange(0.3, 1.0, 0.1))

legend = ax.legend(fontsize=fontsize-5)
legend.get_title().set_fontsize(fontsize)
ax.grid(True)

plt.tight_layout()
# Written to the current working directory
plt.savefig(f"signal_ablation.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# Table of the cross-dataset mean F1 per number of signals, read from the
# full signal set downwards, with the change relative to the previous row.
mean_f1_by_count = df_grouped.groupby("#signals")["f1"].mean().reset_index()

# prepend a row with #signals = 0 and f1 = 0
zero_row = pd.DataFrame([[0, 0.0]], columns=["#signals", "f1"])
df_ablation = pd.concat([zero_row, mean_f1_by_count]).reset_index(drop=True)

df_ablation["Credibility Signal Removed"] = signals_sorted_by_chi2

# highest signal count first, so diff() compares each row with the one before it
df_ablation = df_ablation.sort_index(ascending=False)
decrease_pct = df_ablation["f1"].diff().fillna(0) * 100
df_ablation["Decrease"] = decrease_pct.apply(lambda x: f"{np.round(x,1):.1f}%")
df_ablation["Avg. F1-Macro"] = df_ablation["f1"].apply(lambda x: f"{np.round(x,3):.3f}")
df_ablation["Iteration"] = np.arange(0,20)
df_ablation[["Iteration", "Credibility Signal Removed", "Avg. F1-Macro", "Decrease"]]

In [None]:
# Share of rows, among those where at least one of the two signals voted 1,
# in which both signals cast the same vote.
a = "Inference"
b = "Call to Action"

test = pd.concat(all_dfs)
# .copy() gives an independent frame, so adding the "equal" column below does
# not write into a slice of the concatenated frame.
test = test[(test[a] == 1) | (test[b] == 1)].copy()
test["equal"] = test[a] == test[b]
test["equal"].sum() / len(test)

In [18]:
# Side-by-side table: for every dataset, the mean F1 at each iteration and the
# change from the previous iteration, together with the signal added there.

# Drop the blank placeholder entry without mutating the shared list, so the
# cell can be re-run safely.
signal_names = [signal for signal in signals_sorted_by_chi2 if signal != " "]

per_dataset_tables = []
for i, dataset in enumerate(["PolitiFact", "FakeNewsAMT", "Celebrity", "GossipCop"]):
    table = df_grouped[df_grouped["dataset"] == dataset].reset_index(drop=True)
    table["Credibility Signal added"] = signal_names
    improvement_pct = table["f1"].diff().fillna(0) * 100
    table["Improvement"] = improvement_pct.apply(lambda x: f"{np.round(x,1):.1f}%")
    table[dataset] = table["f1"].apply(lambda x: f"{np.round(x,3):.3f}")
    table["Iteration"] = np.arange(1, len(table) + 1)

    # Every table is indexed by "Iteration" so the column-wise concat aligns
    # the same iteration across datasets. Previously only the first table was,
    # which shifted the other datasets by one row.
    table = table.set_index("Iteration")

    columns = [dataset, "Improvement"]
    if i == 0:
        columns = ["Credibility Signal added"] + columns
    per_dataset_tables.append(table[columns])

df_all = pd.concat(per_dataset_tables, axis=1)

In [19]:
SEED = 42
def get_train_dev_test_fold(fold, dataset, model_size, model_name="llama2_platypus", num_splits=10):
    """Return the ``(train_df, test_df)`` pair of one stratified CV fold.

    Reads ``../data/signals/{dataset}.csv``, splits it with a shuffled
    ``StratifiedKFold`` seeded by ``SEED`` and stratified on ``objective_true``,
    and returns the ``fold``-th split. The train/test size ratios of every
    split visited on the way are printed. ``model_size`` and ``model_name``
    are accepted but not used by this function.
    """
    assert fold < num_splits

    df = pd.read_csv(f"../data/signals/{dataset}.csv")
    total = len(df)
    labels = df["objective_true"].to_numpy()
    splitter = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=SEED)

    fold_no = 0
    for train_idxs, test_idxs in splitter.split(range(total), y=labels):
        train_df = df.iloc[train_idxs]
        test_df = df.iloc[test_idxs]
        print(len(train_df)/total, len(test_df)/total)

        if fold_no == fold:
            return train_df, test_df
        fold_no += 1


def predict_majority(row):
    """Return the majority class among the votes in ``row``.

    ``-1`` votes are abstentions and are ignored. If no vote remains, one of
    ``0`` / ``1`` is drawn at random. If several classes share the highest
    count, one of the tied classes is drawn at random (the tie used to be
    resolved deterministically, contrary to what the comment promised).

    Parameters
    ----------
    row : pandas.Series or sequence
        Votes of all signals for one sample.

    Returns
    -------
    int
        The winning class label.
    """
    votes = pd.Series(row)
    # Counting on the values avoids label-based ``row[0]`` lookups, which fail
    # for rows indexed by signal name.
    counts = votes[votes != -1].value_counts()

    if counts.empty:
        return np.random.choice([0, 1])

    winners = counts[counts == counts.max()].index.tolist()
    if len(winners) == 1:
        return winners[0]
    return np.random.choice(winners)



# Leave-one-out check: for every dataset and CV fold, fit a LabelModel on all
# signals except "Misleading about content" and record the macro F1 on the
# held-out fold.
signals_sorted_by_chi2 = [
    signal.replace("Polarising Language", "Polarizing Language")
    for signal in chi2_heatmap.columns
]
signals_sorted_by_chi2.reverse()

# The signal subset is the same for every dataset and fold, so build it once.
selected_signals = [signal for signal in signals_sorted_by_chi2 if signal != "Misleading about content"]

# Named `fold_scores` rather than `all` so the builtin all() is not shadowed.
fold_scores = []
for dataset in ["politifact", "fakenewsamt", "celebrity", "gossipcop"]:
    df_dataset = pd.read_csv(f"../data/signals/{dataset}.csv")

    # cross validation loop
    sf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
    for fold, (train_index, test_index) in enumerate(sf.split(df_dataset, df_dataset["objective_true"])):
        df_train, df_test = df_dataset.iloc[train_index], df_dataset.iloc[test_index]

        # Seed explicitly; a bare random.seed() reseeds from system entropy.
        random.seed(SEED)
        y_test_gold = df_test["objective_true"].to_numpy()

        L_ws_train = df_train.loc[:, selected_signals].to_numpy()
        L_ws_test = df_test.loc[:, selected_signals].to_numpy()

        label_model = LabelModel(cardinality=2, device="cpu", verbose=False)
        label_model.fit(L_ws_train, n_epochs=500, seed=SEED, progress_bar=False)
        y_pred_ws = label_model.predict(L=L_ws_test, tie_break_policy="random")
        val_f1_macro = f1_score(y_test_gold, y_pred_ws, average='macro', zero_division=0)

        # Record the number of signals actually used; this previously read a
        # stale loop variable `i` left behind by an earlier cell.
        fold_scores.append({"dataset": dataset, "fold": fold, "f1": val_f1_macro, "#signals": len(selected_signals)})

# One row per (dataset, fold)
df = pd.DataFrame(fold_scores)

In [None]:
SEED = 42
def get_train_dev_test_fold(fold, dataset, model_size, model_name="llama2_platypus", num_splits=10):
    """Return the ``(train_df, test_df)`` pair of one stratified CV fold.

    Reads ``../data/signals/{dataset}.csv``, splits it with a shuffled
    ``StratifiedKFold`` seeded by ``SEED`` and stratified on ``objective_true``,
    and returns the ``fold``-th split. The train/test size ratios of every
    split visited on the way are printed. ``model_size`` and ``model_name``
    are accepted but not used by this function.
    """
    assert fold < num_splits

    df = pd.read_csv(f"../data/signals/{dataset}.csv")
    total = len(df)
    labels = df["objective_true"].to_numpy()
    splitter = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=SEED)

    fold_no = 0
    for train_idxs, test_idxs in splitter.split(range(total), y=labels):
        train_df = df.iloc[train_idxs]
        test_df = df.iloc[test_idxs]
        print(len(train_df)/total, len(test_df)/total)

        if fold_no == fold:
            return train_df, test_df
        fold_no += 1


def predict_majority(row):
    """Majority vote over one row of signal outputs, ignoring abstentions (-1).

    Args:
        row: The votes of one sample. A pandas Series in the general case; a
            single-element sequence is also accepted.

    Returns:
        The most frequent non-abstain value. If every vote abstains, or several
        values share the highest count, one candidate is drawn at random with
        `np.random.choice` (from [0, 1] when there are no votes at all).
    """
    if len(row) == 1:
        # Use positional access when available: `row[0]` on a label-indexed
        # Series is a label lookup that only works through a deprecated
        # positional fallback.
        only_vote = row.iloc[0] if hasattr(row, "iloc") else row[0]
        return only_vote if only_vote != -1 else np.random.choice([0, 1])

    counts = row.value_counts().to_dict()
    # Abstentions do not take part in the vote.
    counts.pop(-1, None)

    if not counts:
        return np.random.choice([0, 1])

    top_count = max(counts.values())
    winners = [label for label, count in counts.items() if count == top_count]
    if len(winners) == 1:
        return winners[0]
    # Genuine tie: pick among the tied classes at random instead of always
    # returning whichever one happens to come first.
    return np.random.choice(winners)



# Signal names taken from the columns of `chi2_heatmap`, in reverse column
# order, with "Polarising" rewritten to the "Polarizing" spelling
# (presumably the spelling used by the signals CSV columns — TODO confirm).
signals_sorted_by_chi2 = [
    column_name.replace("Polarising Language", "Polarizing Language")
    for column_name in list(chi2_heatmap.columns)[::-1]
]

def _label_model_macro_f1(df_train, df_test, signals, y_test_gold):
    """Fit a LabelModel on the given signal columns and score it on the test fold.

    Args:
        df_train: Rows whose signal columns are used to fit the label model.
        df_test: Rows the fitted model predicts on.
        signals: Names of the signal columns that form the label matrix.
        y_test_gold: Gold labels for `df_test`, in row order.

    Returns:
        Macro-averaged F1 of the label model's predictions on `df_test`.
    """
    L_ws_train = df_train.loc[:, signals].to_numpy()
    L_ws_test = df_test.loc[:, signals].to_numpy()

    label_model = LabelModel(cardinality=2, device="cpu", verbose=False)
    label_model.fit(L_ws_train, n_epochs=500, seed=SEED, progress_bar=False)
    y_pred_ws = label_model.predict(L=L_ws_test, tie_break_policy="random")
    return f1_score(y_test_gold, y_pred_ws, average='macro', zero_division=0)


# Leave-one-signal-out ablation: for every dataset and CV fold, compare the
# macro-F1 obtained with all signals against the macro-F1 obtained after
# dropping each signal in turn.
# NOTE(review): `all` shadows the Python builtin for the rest of the kernel
# session; the name is kept because other cells in this notebook use it too.
all = []
best_signals_per_dataset = {}  # not populated in this cell
for dataset in ["politifact", "fakenewsamt", "celebrity", "gossipcop"]:
    df = pd.read_csv(f"../data/signals/{dataset}.csv")

    # cross validation loop
    sf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
    for fold, (train_index, test_index) in enumerate(sf.split(range(len(df)), df["objective_true"])):
        df_train, df_test = df.iloc[train_index], df.iloc[test_index]

        # NOTE(review): called without an argument this reseeds Python's global
        # `random` from an OS/time source on every fold, which is at odds with
        # the fixed SEED used everywhere else — confirm whether
        # random.seed(SEED) was intended.
        random.seed()
        y_test_gold = df_test["objective_true"].to_numpy()

        # Train with all signals
        baseline = _label_model_macro_f1(df_train, df_test, signals_sorted_by_chi2, y_test_gold)

        # Drop one signal at a time, by position. The loop follows the length
        # of the signal list instead of assuming exactly 19 signals.
        for i, removed_signal in enumerate(signals_sorted_by_chi2, start=1):
            selected_signals = signals_sorted_by_chi2[:i - 1] + signals_sorted_by_chi2[i:]
            print("Removed signal", removed_signal)
            val_f1_macro = _label_model_macro_f1(df_train, df_test, selected_signals, y_test_gold)

            d = {"dataset": dataset, "fold": fold, "f1": val_f1_macro, "baseline_f1": baseline, "signal removed": removed_signal}
            print(baseline-val_f1_macro)
            all.append(d)

# One row per (dataset, fold, removed signal); aggregation happens in later cells.
df = pd.DataFrame(all)

In [38]:
# Keep a second reference to the current `df` (no copy is made) so the
# un-aggregated rows stay reachable after the next cell rebinds `df`.
df_bank = df

In [39]:
# Average the per-fold rows for every (dataset, removed signal) pair.
# Aggregate from `df_bank` (the frame saved in the previous cell) rather than
# from `df` itself, so re-running this cell does not operate on its own output.
df = df_bank.groupby(["dataset", "signal removed"]).mean().reset_index()

In [None]:
# Relative F1 change of each ablation with respect to "baseline_f1", stored as
# a fraction: negative when F1 dropped after removing the signal, positive
# when it increased.
relative_change = df["f1"].sub(df["baseline_f1"]).div(df["baseline_f1"])
df = df.assign(change=relative_change)[
    ["dataset", "signal removed", "f1", "baseline_f1", "change"]
]
df

In [41]:
# Reshape to one row per removed signal and one "change" column per dataset.
df = (
    df.pivot(index="signal removed", columns="dataset", values="change")
    .reset_index()
)

# Row-wise averages: across every dataset column, and across the two pairs
# labelled "entertainment" and "politics".
df = df.assign(
    mean=df.iloc[:, 1:].mean(axis=1),
    entertainment=df[["celebrity", "gossipcop"]].mean(axis=1),
    politics=df[["fakenewsamt", "politifact"]].mean(axis=1),
)

In [42]:
# Most negative average change first.
df = df.sort_values("mean", ascending=True)

# Turn every change column into a percentage string with one decimal place
# (the fraction is multiplied by 100 and suffixed with "%").
for change_column in ["mean", "celebrity", "fakenewsamt", "gossipcop", "politifact", "politics", "entertainment"]:
    df[change_column] = df[change_column].apply(lambda x: f"{np.round(x*100,1):.1f}%")

In [None]:
# Show `df` as the cell output (in notebook order: the table formatted by the preceding cell).
df

In [None]:
# NOTE(review): this cell needs `df` to hold per-fold rows with "dataset",
# "#signals", "f1" and "baseline_f1" columns. No cell visible around here
# builds a frame with both "#signals" and "baseline_f1", and in notebook order
# `df` has already been pivoted and string-formatted by this point, so the cell
# appears to rely on a `df` left over from an earlier run — TODO confirm or remove.

# Mean of every remaining column, plus the standard deviation of "f1", for each (dataset, #signals) group.
df_grouped = df.groupby(["dataset", "#signals"]).mean().reset_index()
df_grouped["std"] = df.groupby(["dataset", "#signals"]).std().reset_index()["f1"]

# Replace the lowercase dataset identifiers with their display names.
df_grouped.loc[df_grouped["dataset"] == "politifact", "dataset"] = "PolitiFact"
df_grouped.loc[df_grouped["dataset"] == "gossipcop", "dataset"] = "GossipCop"
df_grouped.loc[df_grouped["dataset"] == "fakenewsamt", "dataset"] = "FakeNewsAMT"
df_grouped.loc[df_grouped["dataset"] == "celebrity", "dataset"] = "Celebrity"

df_grouped = df_grouped[["dataset", "#signals", "f1", "baseline_f1", "std"]]
# Average across datasets for each "#signals" value.
df_ablation = df_grouped.groupby("#signals").agg({"f1": "mean", "baseline_f1": "mean"})
# Relative change of "f1" against "baseline_f1", in percent (negative means F1 went down).
df_ablation["% decrease"] = ((df_ablation["f1"] - df_ablation["baseline_f1"]) / df_ablation["baseline_f1"]) * 100
df_ablation = df_ablation.reset_index(drop=True)
# Positional assignment: requires exactly one row per entry of signals_sorted_by_chi2.
df_ablation["Credibility Signal Removed"] = signals_sorted_by_chi2
df_ablation

In [None]:
# Mean of every remaining column, plus the standard deviation of "f1", for
# each (dataset, #signals) group of `df`.
group_keys = ["dataset", "#signals"]
df_grouped = df.groupby(group_keys).mean().reset_index()
df_grouped["std"] = df.groupby(group_keys).std().reset_index()["f1"]

# Replace the lowercase dataset identifiers with their display names.
display_names = {
    "politifact": "PolitiFact",
    "gossipcop": "GossipCop",
    "fakenewsamt": "FakeNewsAMT",
    "celebrity": "Celebrity",
}
for raw_name, display_name in display_names.items():
    df_grouped.loc[df_grouped["dataset"] == raw_name, "dataset"] = display_name

df_grouped = df_grouped[["dataset", "#signals", "f1", "std"]]

# Average F1 across datasets for each "#signals" value, labelled positionally
# with the signal names (needs one row per entry of signals_sorted_by_chi2).
df_ablation = df_grouped.groupby("#signals")["f1"].mean().reset_index()
df_ablation["Credibility Signal Removed"] = signals_sorted_by_chi2

# Reverse the row order so diff() measures each row against the row before it
# in that reversed order; the first row has no predecessor and gets 0.
df_ablation = df_ablation.sort_index(ascending=False)
f1_step = df_ablation["f1"].diff().fillna(0) * 100
df_ablation["Decrease"] = f1_step.apply(lambda x: f"{np.round(x,1):.1f}%")
df_ablation["Avg. F1-Macro"] = df_ablation["f1"].apply(lambda x: f"{np.round(x,3):.3f}")
df_ablation["Iteration"] = np.arange(1,20)

report_columns = ["Iteration", "Credibility Signal Removed", "Avg. F1-Macro", "Decrease"]
df_ablation[report_columns].set_index("Iteration")