In [None]:
import itertools
import math
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import LabelModel

In [None]:
# Shared random seed: passed as random_state to the scikit-learn splitters
# and as seed to LabelModel.fit in the cells below.
SEED = 42

In [None]:
# Count how many signal subsets exist: the sum of "n choose k" for every
# subset size k with 3 <= k <= n.
n = 19  # number of available signals
total = 0
for k in range(3, n + 1):
    # The quotient is always a whole number, so floor division is exact and
    # keeps the counts as ints (true division would turn them into floats).
    num_combinations = math.factorial(n) // (math.factorial(k) * math.factorial(n - k))

    total += num_combinations
    print(k, num_combinations)

print("Total number of possibilities:", total)

In [None]:
def get_train_dev_test_fold(fold, dataset, model_size, model_name="llama2_platypus", num_splits=10):
    """Return the train/test split of one stratified cross-validation fold.

    The dataset CSV is read from
    ``../data/processed/{dataset}/{model_name}/{model_size}/{dataset}.csv`` and
    split with a shuffled ``StratifiedKFold`` (seeded with ``SEED``) that is
    stratified on the ``objective_true`` column.

    Despite the function name, no separate dev split is produced.

    Args:
        fold: Zero-based index of the fold to return; must satisfy
            ``0 <= fold < num_splits``.
        dataset: Dataset name, used for both the directory and the file name.
        model_size: Path component selecting the model-size directory.
        model_name: Path component selecting the model directory.
        num_splits: Number of folds.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames for the requested fold.

    Raises:
        AssertionError: If ``fold`` is outside ``[0, num_splits)``.
    """
    # A negative fold used to pass the check and fall through the loop,
    # silently returning None; reject it up front.
    assert 0 <= fold < num_splits, f"fold must be in [0, {num_splits}), got {fold}"

    dataset_path = f"../data/processed/{dataset}/{model_name}/{model_size}/{dataset}.csv"
    df = pd.read_csv(dataset_path)
    skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=SEED)
    for j, (train_idxs, test_idxs) in enumerate(skf.split(range(len(df)), y=df["objective_true"].to_numpy())):
        if j != fold:
            continue

        train_df, test_df = df.iloc[train_idxs], df.iloc[test_idxs]
        # Report the split proportions of the fold that is actually returned.
        print(len(train_df)/len(df), len(test_df)/len(df))
        return train_df, test_df

In [None]:
# Greedy signal selection: for every dataset, rank the signals by their
# empirical accuracy on the dev split, then fit a label model on the top-i
# signals for each i and remember the prefix with the best macro-F1.
#
# NOTE: `all` shadows the Python built-in of the same name; the name is kept
# because the plotting cell that follows reads it.
all = []
best_signals_per_dataset = {}
for dataset in ["politifact", "fakenewsdataset", "celebritydataset", "gossipcop"]:
    print(dataset)
    best_score = 0
    best_signals = None

    df = pd.read_csv(f"../data/processed/{dataset}/llama2_platypus/70/{dataset}.csv")
    signals = list(df.iloc[:, :19].columns)  # the first 19 columns are used as the signals

    # 50% train, then the remaining half is split evenly into dev and test.
    df_train, df_test = train_test_split(df, train_size=0.5, random_state=SEED)
    df_dev, df_test = train_test_split(df_test, train_size=0.5, random_state=SEED)

    # Seed the global generator deterministically; an argument-less
    # random.seed() would reseed from system time / OS entropy on every run.
    random.seed(SEED)
    y_test_gold = df_test["objective_true"].to_numpy()
    y_dev_gold = df_dev["objective_true"].to_numpy()

    # Sort signals by their empirical accuracy on the dev set (best first).
    L_ws_dev = df_dev.loc[:, signals].to_numpy()
    dev_accs = LFAnalysis(L_ws_dev).lf_empirical_accuracies(y_dev_gold)
    sorted_signal_accs = sorted(zip(signals, dev_accs), key=lambda pair: pair[1], reverse=True)
    signals_sorted = [signal for signal, _ in sorted_signal_accs]

    # NOTE(review): the best prefix is picked by its score on the test split,
    # so best_score is an optimistic estimate - confirm whether the selection
    # should use df_dev instead.
    for i in range(3, len(signals) + 1):
        selected_signals = signals_sorted[:i]
        L_ws_train = df_train.loc[:, selected_signals].to_numpy()
        L_ws_test = df_test.loc[:, selected_signals].to_numpy()

        label_model = LabelModel(cardinality=2, device="cpu", verbose=False)
        label_model.fit(L_ws_train, n_epochs=500, seed=SEED, progress_bar=False)
        y_pred_ws = label_model.predict(L=L_ws_test, tie_break_policy="random")
        val_f1_macro = f1_score(y_test_gold, y_pred_ws, average='macro')

        all.append({"dataset": dataset, "f1": val_f1_macro, "#signals": i})

        # Strict ">" keeps the smallest prefix when several sizes tie.
        if val_f1_macro > best_score:
            best_signals = selected_signals
            best_score = val_f1_macro

    best_signals_per_dataset[dataset] = best_signals

In [None]:
# Display the best-scoring signal subset recorded for each dataset.
best_signals_per_dataset

In [None]:
# Plot macro-F1 against the number of top-ranked signals, one line per dataset.
# The results get their own name so this cell does not overwrite `df`.
greedy_results = pd.DataFrame(all)
fig, ax = plt.subplots(figsize=(9, 5))

# sort=False keeps the datasets in the order in which they were evaluated.
for dataset_name, subset in greedy_results.groupby("dataset", sort=False):
    ax.plot(subset["#signals"], subset["f1"], label=dataset_name)

ax.set_xlabel('# Signals')
ax.set_ylabel("F1 Macro")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(3, max(greedy_results["#signals"]))
# Every subset size occurs once per dataset; place one tick per distinct size
# instead of passing the duplicated column.
ax.set_xticks(sorted({3, *greedy_results["#signals"].tolist()}))
plt.legend(title='Dataset')
plt.show()

In [None]:
# Random-subset baseline: for every dataset and every subset size i, fit a
# label model on subsets of i signals and record the mean macro-F1 together
# with its standard error.
#
# NOTE: `all` shadows the Python built-in of the same name; the name is kept
# because the plotting cell that follows reads it.
# NOTE: this cell fits up to `sample_size` label models per subset size and
# dataset, so it is expensive to run.
all = []
sample_size = 1000  # upper bound on the number of subsets evaluated per size
rng = random.Random(SEED)  # dedicated seeded generator -> reproducible subsets
for dataset in ["politifact", "fakenewsdataset", "celebritydataset", "gossipcop"]:
    print(dataset)
    df = pd.read_csv(f"../data/processed/{dataset}/llama2_platypus/70/{dataset}.csv")
    signals = list(df.iloc[:, :19].columns)  # the first 19 columns are used as the signals
    n_signals = len(signals)

    # The split does not depend on the subset size, so it is made once per dataset.
    df_train, df_test = train_test_split(df, train_size=0.5, random_state=SEED)
    y_test_gold = df_test["objective_true"].to_numpy()

    for i in range(3, n_signals + 1):
        num_combinations = math.factorial(n_signals) // (math.factorial(i) * math.factorial(n_signals - i))
        if num_combinations <= sample_size:
            # Small search space: evaluate every subset exactly once instead
            # of drawing (and refitting) the same subsets over and over.
            candidate_subsets = [list(combo) for combo in itertools.combinations(signals, i)]
        else:
            candidate_subsets = [rng.sample(signals, k=i) for _ in range(sample_size)]

        scores_by_num_signals = []
        for selected_signals in candidate_subsets:
            L_ws_train = df_train.loc[:, selected_signals].to_numpy()
            L_ws_test = df_test.loc[:, selected_signals].to_numpy()
            label_model = LabelModel(cardinality=2, device="cpu", verbose=False)
            label_model.fit(L_ws_train, n_epochs=500, seed=SEED, progress_bar=False)
            y_pred_ws = label_model.predict(L=L_ws_test, tie_break_policy="random")
            scores_by_num_signals.append(f1_score(y_test_gold, y_pred_ws, average='macro'))

        print(i, np.mean(scores_by_num_signals), np.std(scores_by_num_signals))

        mu = np.mean(scores_by_num_signals)
        if len(scores_by_num_signals) > 1:
            std_err = np.std(scores_by_num_signals, ddof=1) / np.sqrt(np.size(scores_by_num_signals))
        else:
            # A single subset (i == n_signals) has no sample spread; report 0
            # rather than the NaN that ddof=1 would produce.
            std_err = 0.0

        d = {"dataset": dataset, "mu": mu, "std_err": std_err, "#signals": i}
        all.append(d)

In [None]:
# Plot the mean macro-F1 per subset size with a +/- one standard error band,
# one line per dataset. `df` keeps its name because later cells read it.
df = pd.DataFrame(all)
fig, ax = plt.subplots(figsize=(9, 5))

# sort=False keeps the datasets in the order in which they were evaluated.
for dataset_name, subset in df.groupby("dataset", sort=False):
    ax.plot(subset["#signals"], subset["mu"], label=dataset_name)
    ax.fill_between(
        subset["#signals"],
        subset["mu"] - subset["std_err"],
        subset["mu"] + subset["std_err"],
        alpha=0.2
    )

ax.set_xlabel('# Signals')
ax.set_ylabel("F1 Macro")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(3, max(df["#signals"]))
# Every subset size occurs once per dataset; place one tick per distinct size
# instead of passing the duplicated column.
ax.set_xticks(sorted({3, *df["#signals"].tolist()}))
plt.legend(title='Dataset')
plt.show()

In [None]:
# Show only the rows for the smallest (3) and largest (19) subset sizes.
is_endpoint_size = df["#signals"].isin([3, 19])
df[is_endpoint_size]

In [None]:
# Average the per-dataset mean macro-F1 over the datasets for every subset size.
# The columns are selected by name rather than by position.
# NOTE: the "std_err" column holds the sample standard deviation of the
# per-dataset means (their spread across datasets), not a standard error.
df_avg = df.groupby("#signals")["mu"].agg(mu="mean", std_err="std").reset_index()

fig, ax = plt.subplots(figsize=(9, 5))

ax.plot(df_avg["#signals"], df_avg["mu"])
ax.fill_between(
    df_avg["#signals"],
    df_avg["mu"] - df_avg["std_err"],
    df_avg["mu"] + df_avg["std_err"],
    alpha=0.2
)

ax.set_xlabel('# Signals')
ax.set_ylabel("F1 Macro")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(3, max(df["#signals"]))
# One tick per distinct subset size instead of the duplicated column.
ax.set_xticks(sorted({3, *df["#signals"].tolist()}))
plt.show()

In [None]:
# Leave-one-out ablation: fit a label model on all signals (baseline) and then
# on every 18-signal subset obtained by removing one signal at a time, and
# print the macro-F1 next to the removed signal.
#
# The dataset is loaded explicitly so that the cell does not depend on
# variables left over from earlier cells (by this point `df` holds the
# plotting summary, not a dataset).
# NOTE(review): "gossipcop" is the last dataset processed by the loops above,
# i.e. the one a leftover df_train would come from - confirm that this is the
# dataset the ablation is meant for.
ablation_dataset = "gossipcop"
df_ablation = pd.read_csv(f"../data/processed/{ablation_dataset}/llama2_platypus/70/{ablation_dataset}.csv")
signals = list(df_ablation.iloc[:, :19].columns)  # the first 19 columns are used as the signals

# The split is the same for every ablation, so it is made once.
df_train, df_test = train_test_split(df_ablation, train_size=0.5, random_state=SEED)
y_test_gold = df_test["objective_true"].to_numpy()

# The empty string matches no column name, so the first iteration keeps every
# signal and serves as the baseline.
for signal in [""] + signals:
    sampled_signals = [x for x in signals if x != signal]

    L_ws_train = df_train.loc[:, sampled_signals].to_numpy()
    L_ws_test = df_test.loc[:, sampled_signals].to_numpy()
    label_model = LabelModel(cardinality=2, device="cpu", verbose=False)
    label_model.fit(L_ws_train, n_epochs=500, seed=SEED, progress_bar=False)
    y_pred_ws = label_model.predict(L=L_ws_test, tie_break_policy="random")
    val_f1_macro = f1_score(y_test_gold, y_pred_ws, average='macro')

    print(signal, val_f1_macro)