In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from snorkel.labeling.model import LabelModel
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import random
import numpy as np

In [None]:
# Random seed passed to the StratifiedKFold splitters (random_state) and to LabelModel.fit (seed).
SEED = 42

In [None]:
def get_train_dev_test_fold(fold, dataset, model_size, model_name="llama2_platypus", num_splits=10):
    """Return the train/test partition for one fold of a stratified K-fold split.

    The dataset is read from
    ``../data/processed/{dataset}/{model_name}/{model_size}/{dataset}.csv`` and
    split with a shuffled ``StratifiedKFold`` seeded by the module-level
    ``SEED``, stratifying on the ``objective_true`` column.

    Note: despite the function name, only two frames (train and test) are
    returned; no separate dev split is produced here.

    Args:
        fold: Zero-based index of the fold to return; must satisfy
            ``0 <= fold < num_splits``.
        dataset: Dataset name; used both as a directory and as the CSV file stem.
        model_size: Path component selecting the model-size subdirectory.
        model_name: Path component selecting the model subdirectory.
        num_splits: Number of folds for the stratified split.

    Returns:
        Tuple ``(train_df, test_df)`` of DataFrames for the requested fold.

    Raises:
        AssertionError: If ``fold`` is outside ``[0, num_splits)``.
    """
    # A negative fold matches no split index and would otherwise fall through
    # the loop and silently return None, so reject it up front.
    assert 0 <= fold < num_splits

    dataset_path = f"../data/processed/{dataset}/{model_name}/{model_size}/{dataset}.csv"
    df = pd.read_csv(dataset_path)
    skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=SEED)
    # Only the number of samples matters for X; stratification uses the label column.
    for j, (train_idxs, test_idxs) in enumerate(skf.split(range(len(df)), y=df["objective_true"].to_numpy())):
        train_df, test_df = df.iloc[train_idxs], df.iloc[test_idxs]
        # Report the train/test fractions of each fold visited on the way to the requested one.
        print(len(train_df)/len(df), len(test_df)/len(df))

        if fold == j:
            return train_df, test_df

In [None]:
def predict_majority(row):
    """Return the majority vote of a row of binary votes, ignoring abstentions.

    Votes equal to ``-1`` are treated as abstentions and discarded. If no vote
    remains, a class is drawn uniformly from ``{0, 1}``. If several classes share
    the highest count, one of them is drawn uniformly at random.

    Args:
        row: Votes for one sample. Rows with more than one entry must be a
            pandas Series (``value_counts`` is used); a single-entry row may
            also be a plain sequence.

    Returns:
        The winning class label.
    """
    if len(row) == 1:
        # Positional access: `row[0]` on a Series is a *label* lookup and fails
        # (or is deprecated) when the index is not a 0-based integer range.
        only_vote = row.iloc[0] if hasattr(row, "iloc") else row[0]
        return only_vote if only_vote != -1 else np.random.choice([0, 1])

    counts = row.value_counts().to_dict()
    counts.pop(-1, None)  # abstentions never count as votes

    if not counts:
        return np.random.choice([0, 1])

    top_count = max(counts.values())
    winners = [label for label, n in counts.items() if n == top_count]
    if len(winners) == 1:
        return winners[0]
    # Genuine tie: pick randomly instead of always favouring the same class.
    return np.random.choice(winners)

In [None]:
def _pad_with_abstains(L, min_cols=3):
    """Right-pad a 2-D label matrix with all-abstain (-1) columns up to ``min_cols`` columns."""
    missing = min_cols - L.shape[1]
    if missing <= 0:
        return L
    # Match the dtype of L so the padded matrix is not silently upcast to float.
    padding = np.full((L.shape[0], missing), -1, dtype=L.dtype)
    return np.concatenate([L, padding], axis=1)


# Signal (column) names in a fixed ranking order. The variable name suggests the
# ranking comes from a correlation analysis; that analysis is not part of this cell.
signals_sorted_by_corr = [
    'Document Citation', 'Sensationalism', 'Misleading about content',
    'Evidence', 'Expert Citation', 'Emotional Valence',
    'Reported by Other Sources', 'Clickbait', 'Source Credibility', 'Bias',
    'Explicitly Unverified Claims', 'Polarizing Language', 'Informal Tone',
    'Incorrect Spelling', 'Incivility', 'Personal Perspective', 'Inference',
    'Impoliteness', 'Call to Action',
]

# Reverse the ranking so that the prefix `[:i]` below starts from the end of the list above.
signals_sorted_by_corr.reverse()

# One record per (dataset, fold, #signals).
# NOTE(review): `all` shadows the builtin; the name is kept because the aggregation cell reads it.
all = []
best_signals_per_dataset = {}
for dataset in ["politifact", "fakenewsamt", "celebrity", "gossipcop"]:
    print(dataset)

    df = pd.read_csv(f"../data/signals/{dataset}.csv")

    # cross validation loop
    sf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
    for fold, (train_index, test_index) in enumerate(sf.split(df, df["objective_true"])):
        df_train, df_test = df.iloc[train_index], df.iloc[test_index]

        # Seed the global RNGs with the shared SEED. A bare `random.seed()` re-seeds
        # from OS entropy, which makes any random tie-breaking irreproducible.
        random.seed(SEED)
        np.random.seed(SEED)
        y_test_gold = df_test["objective_true"].to_numpy()

        # Evaluate the label model on growing prefixes of the signal ranking.
        for i in range(1, len(signals_sorted_by_corr) + 1):
            selected_signals = signals_sorted_by_corr[:i]
            # Per the original author's note, snorkel does not allow fewer than 3
            # signals, so narrower matrices are padded with abstain-only columns.
            L_ws_train = _pad_with_abstains(df_train.loc[:, selected_signals].to_numpy())
            L_ws_test = _pad_with_abstains(df_test.loc[:, selected_signals].to_numpy())

            label_model = LabelModel(cardinality=2, device="cpu", verbose=False)
            label_model.fit(L_ws_train, n_epochs=500, seed=SEED, progress_bar=False)
            y_pred_ws = label_model.predict(L=L_ws_test, tie_break_policy="random")
            val_f1_macro = f1_score(y_test_gold, y_pred_ws, average='macro', zero_division=0)

            d = {"dataset": dataset, "fold": fold, "f1": val_f1_macro, "#signals": i}
            all.append(d)

In [None]:
# Aggregate the per-fold records for every (dataset, number of signals) pair:
# the mean of the remaining columns, plus the standard deviation of the F1 score.
df = pd.DataFrame(all)
group_keys = ["dataset", "#signals"]
df_grouped = df.groupby(group_keys).mean().reset_index()
f1_std = df.groupby(group_keys).std().reset_index()["f1"]
df_grouped["std"] = f1_std


In [None]:
fig, ax = plt.subplots(figsize=(9, 5))

# Swap dataset identifiers for their display names (used as legend labels).
# This rewrites df_grouped["dataset"]; names that are already display names are left untouched.
display_names = {
    "politifact": "PolitiFact",
    "gossipcop": "GossipCop",
    "fakenewsamt": "FakeNewsAMT",
    "celebrity": "Celebrity",
}
df_grouped["dataset"] = df_grouped["dataset"].map(lambda name: display_names.get(name, name))

# Get unique datasets
unique_datasets = df_grouped["dataset"].unique()
fontsize = 20
# Plot the mean F1 curve of each dataset (no error bars are drawn).
for dataset in unique_datasets:
    subset = df_grouped[df_grouped["dataset"] == dataset]
    ax.plot(subset["#signals"], subset["f1"], label=dataset, linewidth=2)

ax.set_xlabel('# Signals', fontsize=fontsize)
ax.set_ylabel("F1 Macro", fontsize=fontsize)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# The x-axis starts at 3 signals. Ticks must be unique and inside that range:
# set_xticks expands the view limits to show every tick it is given, so passing
# the raw column (which contains 1 and 2, repeated for every row) would undo
# the set_xlim call below.
min_signals = 3
signal_counts = {int(n) for n in df_grouped["#signals"]}
x_ticks = sorted({min_signals} | {n for n in signal_counts if n >= min_signals})
ax.set_xlim(min_signals, max(signal_counts))
ax.set_xticks(x_ticks)

ax.tick_params(axis='y', labelsize=fontsize)
ax.tick_params(axis='x', labelsize=fontsize)
ax.set_ylim(0.30, 1.0)
ax.set_yticks(np.arange(0.30, 1.1, 0.1))
legend = ax.legend(fontsize=fontsize-5)
legend.get_title().set_fontsize(fontsize)
ax.grid(True)

plt.tight_layout()
# Uncomment to export the figure:
# plt.savefig("signal_ablation_sorted_corr_all.pdf", format="pdf", bbox_inches="tight")
plt.show()
