In [None]:
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import LabelModel
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [None]:
# Shared random seed: passed to StratifiedKFold (shuffle) and to LabelModel.fit
# so that fold assignment and label-model training are repeatable.
SEED = 42

def get_train_test_fold(fold, dataset, num_splits=10):
    """Return the train/test DataFrames for one stratified fold of a dataset.

    The dataset is read from ``../data/signals/{dataset}.csv`` and split with a
    shuffled ``StratifiedKFold`` (seeded with ``SEED``), stratifying on the
    ``objective_true`` column.

    Args:
        fold: Zero-based index of the fold to return; must satisfy
            ``0 <= fold < num_splits``.
        dataset: Base name of the CSV file (without extension).
        num_splits: Number of stratified splits to create.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames for the requested fold.

    Raises:
        AssertionError: If ``fold`` is outside ``[0, num_splits)``.
    """
    # A negative fold used to slip through and make the function return None,
    # which only surfaced later as an unpacking error in the caller.
    assert 0 <= fold < num_splits, f"fold must be in [0, {num_splits}), got {fold}"

    dataset_path = f"../data/signals/{dataset}.csv"
    df = pd.read_csv(dataset_path)
    skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=SEED)
    splits = skf.split(range(len(df)), y=df["objective_true"].to_numpy())
    for j, (train_idxs, test_idxs) in enumerate(splits):
        # Only materialise the DataFrames for the fold that was asked for.
        if j == fold:
            return df.iloc[train_idxs], df.iloc[test_idxs]

    # Unreachable while the range check above holds; kept so the function can
    # never silently return None (e.g. when asserts are disabled with -O).
    raise ValueError(f"fold {fold} not produced by a {num_splits}-way split")

In [None]:
# Evaluate the Snorkel LabelModel on every dataset and plot the fold-averaged
# confusion matrix, expressed as a percentage of each test fold (mean ± std).
NUM_SIGNALS = 19  # the first 19 CSV columns are used as the label matrix
NUM_FOLDS = 10    # number of folds that are evaluated and averaged over
NUM_SPLITS = 70   # number of stratified splits requested per dataset
# NOTE(review): only the first NUM_FOLDS of the NUM_SPLITS folds are evaluated —
# confirm this is intentional.

# Plot titles per dataset. The legacy "...dataset" spellings are accepted too,
# because the original title branches tested those names and never matched
# "celebrity" / "fakenewsamt".
DATASET_TITLES = {
    "politifact": "PolitiFact",
    "gossipcop": "GossipCop",
    "celebrity": "Celebrity",
    "celebritydataset": "Celebrity",
    "fakenewsamt": "FakeNewsAMT",
    "fakenewsdataset": "FakeNewsAMT",
}

for dataset in ["politifact", "gossipcop", "celebrity", "fakenewsamt"]:
    mean_fnr = 0
    mean_fpr = 0
    mean_f1 = 0
    mean_triggered_signals = np.zeros(NUM_SIGNALS)
    confusion_matrices = []
    for i in range(NUM_FOLDS):
        train_df, test_df = get_train_test_fold(i, dataset, NUM_SPLITS)
        L_train = train_df.iloc[:, :NUM_SIGNALS].to_numpy()
        y_train = train_df["objective_true"].to_numpy()  # not passed to fit(); kept for inspection
        L_test = test_df.iloc[:, :NUM_SIGNALS].to_numpy()
        y_test = test_df["objective_true"].to_numpy()

        # NOTE(review): two models are fitted per fold. FNR/FPR come from this
        # 2000-epoch model, while F1 and the confusion matrix come from the
        # 500-epoch model below — confirm which setting is intended and unify.
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L_train, n_epochs=2000, log_freq=100, seed=SEED)
        prediction = label_model.predict(L_test, tie_break_policy="random")

        false_negatives = (prediction == 0) & (y_test == 1)
        false_positives = (prediction == 1) & (y_test == 0)
        mean_fnr += false_negatives.sum() / (y_test == 1).sum()
        mean_fpr += false_positives.sum() / (y_test == 0).sum()

        # Replace -1 entries with 0 so that column sums are not offset by them
        # (presumably -1 marks an abstaining signal — verify against the data).
        non_zero_L = L_test.copy()
        non_zero_L[non_zero_L == -1] = 0
        fn_triggered_signals = non_zero_L[false_negatives].sum(axis=0)  # triggered signals
        # Accumulate per fold; previously this ran once after the loop, so only
        # the last fold contributed to the "mean".
        mean_triggered_signals += fn_triggered_signals

        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L_train, n_epochs=500, log_freq=100, seed=SEED)
        prediction = label_model.predict(L_test, tie_break_policy="random")

        mean_f1 += f1_score(y_test, prediction, average="macro")
        # Pin the label order so the matrix is always 2x2, even if a small test
        # fold or the predictions happen to contain a single class.
        cm = confusion_matrix(y_test, prediction, labels=[0, 1])
        confusion_matrices.append(cm / np.sum(cm))

    average_confusion_matrix = np.mean(confusion_matrices, axis=0) * 100
    std_confusion_matrix = np.std(confusion_matrices, axis=0) * 100

    mean_f1 /= NUM_FOLDS
    mean_fnr /= NUM_FOLDS
    mean_fpr /= NUM_FOLDS
    mean_triggered_signals /= NUM_FOLDS

    print(dataset)
    print("F1 Macro", mean_f1)
    print("FNR", mean_fnr)
    print("FPR", mean_fpr)

    # One "mean% ± std%" annotation per confusion-matrix cell.
    labels = [
        [
            f"{average_confusion_matrix[row][col]:.1f}%\n±{std_confusion_matrix[row][col]:.1f}%"
            for col in range(2)
        ]
        for row in range(2)
    ]

    plt.figure(figsize=(12,10))
    sns.set(font_scale=5.0)
    sns.heatmap(average_confusion_matrix, annot=labels, fmt="", cmap='Blues', vmin=0, vmax=100, annot_kws={"size": 60})
    title = DATASET_TITLES.get(dataset)
    if title is not None:
        plt.title(title, fontweight="bold")

    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    # plt.savefig(f"../confusion_matrix_{dataset}.pdf", bbox_inches='tight', dpi=300)
    plt.show()


In [None]:
from sklearn.metrics import confusion_matrix


# Evaluate the Snorkel LabelModel on every dataset and plot the fold-averaged
# confusion matrix as raw counts per test fold (mean ± std across folds).
NUM_SIGNALS = 19  # the first 19 CSV columns are used as the label matrix
NUM_FOLDS = 10    # number of folds that are evaluated and averaged over
NUM_SPLITS = 70   # number of stratified splits requested per dataset
# NOTE(review): only the first NUM_FOLDS of the NUM_SPLITS folds are evaluated —
# confirm this is intentional.

# Plot titles per dataset. The legacy "...dataset" spellings are accepted too,
# because the original title branches tested those names and never matched
# "celebrity" / "fakenewsamt".
DATASET_TITLES = {
    "politifact": "PolitiFact",
    "gossipcop": "GossipCop",
    "celebrity": "Celebrity",
    "celebritydataset": "Celebrity",
    "fakenewsamt": "FakeNewsAMT",
    "fakenewsdataset": "FakeNewsAMT",
}

for dataset in ["politifact", "gossipcop", "celebrity", "fakenewsamt"]:
    mean_fnr = 0
    mean_fpr = 0
    mean_f1 = 0
    mean_triggered_signals = np.zeros(NUM_SIGNALS)
    confusion_matrices = []
    for i in range(NUM_FOLDS):
        train_df, test_df = get_train_test_fold(i, dataset, NUM_SPLITS)
        L_train = train_df.iloc[:, :NUM_SIGNALS].to_numpy()
        y_train = train_df["objective_true"].to_numpy()  # not passed to fit(); kept for inspection
        L_test = test_df.iloc[:, :NUM_SIGNALS].to_numpy()
        y_test = test_df["objective_true"].to_numpy()

        # NOTE(review): two models are fitted per fold. FNR/FPR come from this
        # 2000-epoch model, while F1 and the confusion matrix come from the
        # 500-epoch model below — confirm which setting is intended and unify.
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L_train, n_epochs=2000, log_freq=100, seed=SEED)
        prediction = label_model.predict(L_test, tie_break_policy="random")

        false_negatives = (prediction == 0) & (y_test == 1)
        false_positives = (prediction == 1) & (y_test == 0)
        mean_fnr += false_negatives.sum() / (y_test == 1).sum()
        mean_fpr += false_positives.sum() / (y_test == 0).sum()

        # Replace -1 entries with 0 so that column sums are not offset by them
        # (presumably -1 marks an abstaining signal — verify against the data).
        non_zero_L = L_test.copy()
        non_zero_L[non_zero_L == -1] = 0
        fn_triggered_signals = non_zero_L[false_negatives].sum(axis=0)  # triggered signals
        # Accumulate per fold; previously this ran once after the loop, so only
        # the last fold contributed to the "mean".
        mean_triggered_signals += fn_triggered_signals

        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L_train, n_epochs=500, log_freq=100, seed=SEED)
        prediction = label_model.predict(L_test, tie_break_policy="random")

        mean_f1 += f1_score(y_test, prediction, average="macro")
        # Pin the label order so the matrix is always 2x2, even if a small test
        # fold or the predictions happen to contain a single class.
        cm = confusion_matrix(y_test, prediction, labels=[0, 1])
        confusion_matrices.append(cm)

    average_confusion_matrix = np.mean(confusion_matrices, axis=0)
    std_confusion_matrix = np.std(confusion_matrices, axis=0)

    mean_f1 /= NUM_FOLDS
    mean_fnr /= NUM_FOLDS
    mean_fpr /= NUM_FOLDS
    mean_triggered_signals /= NUM_FOLDS

    print(dataset)
    print("F1 Macro", mean_f1)
    print("FNR", mean_fnr)
    print("FPR", mean_fpr)

    # One "mean ± std" annotation (in samples) per confusion-matrix cell.
    labels = [
        [
            f"{average_confusion_matrix[row][col]:.1f}\n±{std_confusion_matrix[row][col]:.1f}"
            for col in range(2)
        ]
        for row in range(2)
    ]

    plt.figure(figsize=(12,10))
    sns.set(font_scale=5.0)
    # Colour scale spans the observed range of mean counts for this dataset.
    sns.heatmap(average_confusion_matrix, annot=labels, fmt="", cmap='Blues', vmin=average_confusion_matrix.min(), vmax=average_confusion_matrix.max(), annot_kws={"size": 60})
    title = DATASET_TITLES.get(dataset)
    if title is not None:
        plt.title(title, fontweight="bold")

    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    # plt.savefig(f"../confusion_matrix_{dataset}.pdf", bbox_inches='tight', dpi=300)
    plt.show()
