In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from collections import Counter, defaultdict

import numpy as np
import json
from wordcloud import WordCloud

from scipy.stats import norm
from scipy.spatial.distance import jensenshannon
import seaborn as sns

from sklearn.preprocessing import StandardScaler

from scipy.stats import pointbiserialr
from scipy.stats import fisher_exact

In [None]:
def rebuild_articles(df):
    """
    Collapse a sentence-level DataFrame into one row per article.

    Parameters:
    - df: DataFrame with the columns 'id_article', 'text',
      'persuasion_techniques' (a list of labels per row, missing values allowed),
      'entities' (a list of dicts per row, missing values allowed) and 'label'.

    Returns:
    - DataFrame with one row per 'id_article' and the columns 'id_article',
      'text' (the row texts joined with a single space),
      'persuasion_techniques' (de-duplicated list), 'entities' (de-duplicated
      list of dicts) and 'label' (first non-missing label of the article).

    Duplicates are removed in first-seen order, so the output is identical
    from run to run (a plain ``set`` would make the order depend on string
    hashing).
    """

    def unique_techniques(per_row):
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(
            dict.fromkeys(
                technique for techniques in per_row.dropna() for technique in techniques
            )
        )

    def unique_entities(per_row):
        # Dicts are not hashable, so each entity is compared through the tuple
        # of its items; this requires the entity values to be hashable.
        unique_items = dict.fromkeys(
            tuple(entity.items()) for entities in per_row.dropna() for entity in entities
        )
        return [dict(items) for items in unique_items]

    grouped = df.groupby("id_article")

    combined_text = grouped["text"].apply(" ".join).reset_index()
    combined_techniques = (
        grouped["persuasion_techniques"].apply(unique_techniques).reset_index()
    )
    combined_entities = grouped["entities"].apply(unique_entities).reset_index()
    labels = grouped["label"].first().reset_index()

    return (
        combined_text.merge(combined_techniques, on="id_article")
        .merge(combined_entities, on="id_article")
        .merge(labels, on="id_article")
    )

In [None]:
# Persuasion-technique labels retained by the analysis. load_dataset() drops
# every technique that is not in this list, and several cells index tables by it.
# The commented-out entries are labels that are deliberately excluded.
SEMEVAL_LABELS = [
    "Appeal_to_Authority",
    "Appeal_to_Fear-Prejudice",
    "Appeal_to_Hypocrisy",
    # "Appeal_to_Popularity",
    # "Appeal_to_Time",
    "Appeal_to_Values",
    "Causal_Oversimplification",
    # "Consequential_Oversimplification",
    "Conversation_Killer",
    "Doubt",
    "Exaggeration-Minimisation",
    "False_Dilemma-No_Choice",
    "Flag_Waving",
    "Guilt_by_Association",
    "Loaded_Language",
    "Name_Calling-Labeling",
    # "Obfuscation-Vagueness-Confusion",
    "Questioning_the_Reputation",
    # "Red_Herring",
    "Repetition",
    "Slogans",
    # "Straw_Man",
    # "Whataboutism",
]

In [None]:
def load_dataset(dataset_name):
    """
    Load one processed dataset and prepare it for the analysis.

    The CSV is read, the 'entities' column is parsed from JSON, the
    comma-separated 'persuasion_techniques' column is split into a list that
    keeps only the labels in SEMEVAL_LABELS, and only rows with ``label == 1``
    are kept. For "euvsdisinfo" the rows are additionally restricted to those
    whose comma-separated 'keywords' contain "War in Ukraine".

    Parameters:
    - dataset_name: one of "cidii", "covid", "climate_fever", "euvsdisinfo".

    Returns:
    - The filtered DataFrame with a fresh 0..n-1 index.

    Raises:
    - ValueError: if dataset_name is not one of the known names.
    """
    processed_files = {
        "cidii": "datasets/processed/cidii.csv",
        "covid": "datasets/processed/covid.csv",
        "climate_fever": "datasets/processed/climate_fever.csv",
        "euvsdisinfo": "datasets/processed/euvsdisinfo.csv",
    }
    # Reject unknown names before touching the file system.
    if not isinstance(dataset_name, str) or dataset_name not in processed_files:
        raise ValueError(f"Unknown dataset name: {dataset_name}")

    def split_csv_field(value):
        # Missing cells are read as NaN (a float), which maps to "no values".
        return value.split(",") if isinstance(value, str) else []

    dataset_df = pd.read_csv(processed_files[dataset_name])
    dataset_df["entities"] = dataset_df["entities"].apply(json.loads)
    dataset_df["persuasion_techniques"] = dataset_df["persuasion_techniques"].apply(
        lambda value: [t for t in split_csv_field(value) if t in SEMEVAL_LABELS]
    )
    dataset_df = dataset_df[dataset_df["label"] == 1].reset_index(drop=True)

    if dataset_name == "euvsdisinfo":
        about_war_in_ukraine = dataset_df["keywords"].apply(
            lambda value: "War in Ukraine" in split_csv_field(value)
        )
        dataset_df = dataset_df[about_war_in_ukraine].reset_index(drop=True)

    return dataset_df


# Load the four datasets used throughout the notebook; each frame already
# contains only rows with label == 1 (see load_dataset).
cidii_df = load_dataset("cidii")
covid_df = load_dataset("covid")
climate_fever_df = load_dataset("climate_fever")
euvsdisinfo_df = load_dataset("euvsdisinfo")

In [None]:
def get_top_n_entities_per_persuasion_technique(df, technique, n=None):
    """
    Count the entities that occur in rows tagged with a persuasion technique.

    Parameters:
    - df: DataFrame with a 'persuasion_techniques' column (a container of
      labels per row) and an 'entities' column (a list of dicts per row, each
      with the keys 'text' and 'label').
    - technique: the persuasion technique to select rows by.
    - n: how many of the most frequent entities to return; None returns all.

    Returns:
    - A list of (lower-cased entity text, entity label, count) tuples, most
      frequent first. Entities with an empty label are ignored.
    """
    has_technique = df["persuasion_techniques"].apply(lambda x: technique in x)
    # astype(bool) keeps the mask a boolean indexer even when df has no rows.
    entities_per_row = df[has_technique.astype(bool)]["entities"]

    entity_counter = Counter(
        (entity["text"].lower(), entity["label"])
        for row_entities in entities_per_row
        for entity in row_entities
        if entity["label"]
    )

    return [
        (text, label, count) for (text, label), count in entity_counter.most_common(n)
    ]


def generate_wordcloud_data(entities_data):
    """
    Group (entity, label, count) tuples into a label -> {entity: count} mapping.

    Items that are not 3-tuples are reported on stdout and skipped. When the
    same entity appears more than once under a label, the last count wins.
    """
    frequencies = defaultdict(dict)
    for record in entities_data:
        well_formed = isinstance(record, tuple) and len(record) == 3
        if not well_formed:
            print(f"Skipping item due to incorrect format: {record}")
            continue
        text, entity_type, occurrences = record
        frequencies[entity_type][text] = occurrences
    return frequencies


def assign_colors_to_labels(labels):
    """
    Map every label to a colour from a fixed palette, in iteration order.

    The palette has 18 entries; further labels wrap around and reuse colours
    from the start of the palette.
    """
    palette = (
        "#E63946",  # Bright Red
        "#457B9D",  # Teal Blue
        "#2A9D8F",  # Jade Green
        "#8B4513",  # Brown
        "#F77F00",  # Vivid Orange
        "#8D99AE",  # Slate Gray
        "#264653",  # Deep Blue
        "#FFDD00",  # Bright Yellow
        "#06D6A0",  # Aqua Green
        "#3D348B",  # Deep Purple
        "#FF1493",  # Bright Pink
        "#73A942",  # Olive Green
        "#006400",  # Deep Green
        "#DAA520",  # Golden
        "#2803fc",  # Deep blue
        "#FF00FF",  # Magenta
        "#00FFFF",  # Cyan
        "#FF4500",  # OrangeRed
    )
    palette_size = len(palette)

    colour_by_label = {}
    for position, label in enumerate(labels):
        colour_by_label[label] = palette[position % palette_size]
    return colour_by_label


def color_func(word, **kwargs):
    """
    Return the colour for a word in the word cloud.

    The word's entity label is looked up in the module-level ``word_to_label``
    mapping and its colour in the module-level ``label_colors`` mapping; black
    ("#000000") is returned when either lookup has no entry. Extra keyword
    arguments are accepted and ignored.
    """
    entity_label = word_to_label.get(word)
    if entity_label in label_colors:
        return label_colors[entity_label]
    return "#000000"


def plot_wordclouds_with_legend(*datasets):
    """
    Plots a word cloud for each dataset, with words colored according to entity type.

    Parameters:
    - *datasets: (entities_data, title) pairs, where entities_data is a list of
      (entity, label, count) tuples as accepted by generate_wordcloud_data.

    One subplot row is created per dataset, a shared legend maps colours to
    entity types, and the figure is saved to "figures/wordclouds.pdf" and shown.

    Side effect: rebinds the module-level ``label_colors`` and
    ``word_to_label`` mappings, because color_func reads them as globals.
    """
    global label_colors, word_to_label
    word_to_label = {}

    # Prepare every dataset once; the result is needed for both the colour
    # assignment and the rendering.
    frequencies_per_dataset = [generate_wordcloud_data(data) for data, _ in datasets]

    present_labels = set()
    for word_freq_by_type in frequencies_per_dataset:
        present_labels.update(word_freq_by_type.keys())

    # Sort so that colours and legend order do not depend on set iteration order.
    legend_order = sorted(present_labels, key=str)
    label_colors = assign_colors_to_labels(legend_order)

    # One row per dataset (3 inches each, i.e. 6x12 for four datasets);
    # squeeze=False keeps `axes` an array even for a single row.
    n_rows = max(len(datasets), 1)
    fig, axes = plt.subplots(n_rows, 1, figsize=(6, 3 * n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, (_, title), word_freq_by_type in zip(
        axes, datasets, frequencies_per_dataset
    ):
        combined_frequencies = {}
        for label, words in word_freq_by_type.items():
            for word, count in words.items():
                combined_frequencies[word] = count
                word_to_label[word] = label

        # WordCloud cannot be generated from an empty frequency table; leave
        # the panel blank (title only) in that case.
        if combined_frequencies:
            wc = WordCloud(width=1000, height=300, background_color="white")
            wc.generate_from_frequencies(combined_frequencies)
            ax.imshow(wc.recolor(color_func=color_func), interpolation="bilinear")
        ax.set_title(title, fontsize=14, weight="bold")
        ax.axis("off")

    fig.subplots_adjust(hspace=0.4)

    legend_handles = [
        plt.Line2D([0], [0], color=label_colors[label], lw=4) for label in legend_order
    ]
    fig.legend(
        legend_handles,
        legend_order,
        title="Entity Type",
        fontsize=8,
        title_fontsize=10,
        loc="lower center",
        ncol=5,
        bbox_to_anchor=(0.5, 0.17),
    )

    plt.tight_layout()
    # Reserve room at the bottom for the legend after tight_layout.
    plt.subplots_adjust(bottom=0.25)
    plt.savefig("figures/wordclouds.pdf", bbox_inches="tight", dpi=600)
    plt.show()


# For each dataset, collect the 50 most frequent (entity, label, count) tuples
# among the rows tagged with the chosen persuasion technique.
technique = "Appeal_to_Authority"
cidii_df_entities = get_top_n_entities_per_persuasion_technique(
    cidii_df, n=50, technique=technique
)
covid_df_entities = get_top_n_entities_per_persuasion_technique(
    covid_df, n=50, technique=technique
)
climate_fever_df_entities = get_top_n_entities_per_persuasion_technique(
    climate_fever_df, n=50, technique=technique
)
euvsdisinfo_df_entities = get_top_n_entities_per_persuasion_technique(
    euvsdisinfo_df, n=50, technique=technique
)

# Plot word clouds with legend
plot_wordclouds_with_legend(
    (cidii_df_entities, "CIDII"),
    (covid_df_entities, "COVID"),
    (climate_fever_df_entities, "Climate Fever"),
    (euvsdisinfo_df_entities, "EUvsDisinfo"),
)


#  Credit to https://towardsdatascience.com/explorations-in-named-entity-recognition-and-was-eleanor-roosevelt-right-671271117218
# PERSON:      People, including fictional.
# NORP:        Nationalities or religious or political groups.
# FAC:         Buildings, airports, highways, bridges, etc.
# ORG:         Companies, agencies, institutions, etc.
# GPE:         Countries, cities, states.
# LOC:         Non-GPE locations, mountain ranges, bodies of water.
# PRODUCT:     Objects, vehicles, foods, etc. (Not services.)
# EVENT:       Named hurricanes, battles, wars, sports events, etc.
# WORK_OF_ART: Titles of books, songs, etc.
# LAW:         Named documents made into laws.
# LANGUAGE:    Any named language.
# DATE:        Absolute or relative dates or periods.
# TIME:        Times smaller than a day.
# PERCENT:     Percentage, including “%”.
# MONEY:       Monetary values, including unit.
# QUANTITY:    Measurements, as of weight or distance.
# ORDINAL:     “first”, “second”, etc.
# CARDINAL:    Numerals that do not fall under another type.

In [None]:
# Example of appeal to authority mentioning an entity.
# The filter matches "ipcc" in the text case-insensitively; it does not look at
# the technique labels. .iloc[4] picks the fifth matching row.
climate_fever_df[climate_fever_df["text"].str.casefold().str.contains("ipcc")].iloc[4][
    "text"
]

In [None]:
# Example of exaggeration-minimisation: the fifth Climate Fever row tagged with
# that technique.
climate_fever_df[
    climate_fever_df["persuasion_techniques"].apply(
        lambda x: "Exaggeration-Minimisation" in x
    )
].iloc[4]["text"]

In [None]:
# Example of questioning the reputation with a text shorter than 100
# characters: the tenth such EUvsDisinfo row.
df = euvsdisinfo_df[
    euvsdisinfo_df["persuasion_techniques"].apply(
        lambda x: "Questioning_the_Reputation" in x
    )
]
df[df["text"].str.len() < 100].iloc[9]["text"]

In [None]:
# Example of appeal to fear in COVID: the 43rd row tagged with the technique
# whose text is shorter than 100 characters.
df = covid_df[
    covid_df["persuasion_techniques"].apply(lambda x: "Appeal_to_Fear-Prejudice" in x)
]
df[df["text"].str.len() < 100].iloc[42]["text"]

In [None]:
# Examples of repetition in CIDII: the third and ninth rows tagged with the
# technique whose text is longer than 100 characters.
df = cidii_df[cidii_df["persuasion_techniques"].apply(lambda x: "Repetition" in x)]
print(df[df["text"].str.len() > 100].iloc[2]["text"])
print(df[df["text"].str.len() > 100].iloc[8]["text"])

In [None]:
# Examples of name calling / labeling in CIDII: the third and ninth rows tagged
# with the technique whose text is longer than 100 characters.
df = cidii_df[
    cidii_df["persuasion_techniques"].apply(lambda x: "Name_Calling-Labeling" in x)
]
print(df[df["text"].str.len() > 100].iloc[2]["text"])
print(df[df["text"].str.len() > 100].iloc[8]["text"])

In [None]:
# All CIDII texts tagged with Appeal_to_Fear-Prejudice. load_dataset() already
# keeps only rows with label == 1, so the label mask does not remove anything.
cidii_df[
    (cidii_df["persuasion_techniques"].apply(lambda x: "Appeal_to_Fear-Prejudice" in x))
    & (cidii_df["label"] == 1)
    # & (cidii_df["text"].apply(lambda x: "?" in x))
]["text"]

In [None]:
# All COVID texts tagged with Appeal_to_Fear-Prejudice. load_dataset() already
# keeps only rows with label == 1, so the label mask does not remove anything.
covid_df[
    (covid_df["persuasion_techniques"].apply(lambda x: "Appeal_to_Fear-Prejudice" in x))
    & (covid_df["label"] == 1)
    # & (covid_df["text"].apply(lambda x: "?" in x))
]["text"]

In [None]:
# All Climate Fever texts tagged with Appeal_to_Fear-Prejudice. load_dataset()
# already keeps only rows with label == 1, so the label mask does not remove
# anything.
climate_fever_df[
    (
        climate_fever_df["persuasion_techniques"].apply(
            lambda x: "Appeal_to_Fear-Prejudice" in x
        )
    )
    & (climate_fever_df["label"] == 1)
]["text"]

In [None]:
# All EUvsDisinfo texts tagged with Appeal_to_Fear-Prejudice.
# Both masks must come from euvsdisinfo_df: a label mask built from another
# frame has a different index and does not line up with these rows.
euvsdisinfo_df[
    (
        euvsdisinfo_df["persuasion_techniques"].apply(
            lambda x: "Appeal_to_Fear-Prejudice" in x
        )
    )
    & (euvsdisinfo_df["label"] == 1)
]["text"]

In [None]:
def get_df_statistics(df):
    """
    Compute summary statistics for a sentence-level dataset.

    Parameters:
    - df: DataFrame with the columns 'id_article', 'text',
      'persuasion_techniques' (a list per row) and 'entities' (a list per row).

    Returns:
    - A dict with the number of distinct articles, the number of rows
      (sentences), and the average text length, number of persuasion techniques
      and number of entities per row, each rounded to one decimal.
    """
    sent_str_len = df["text"].str.len().mean()
    num_sent = len(df)
    avg_persuasion_per_text = df["persuasion_techniques"].apply(len).mean()
    avg_entities_per_text = df["entities"].apply(len).mean()

    # The number of articles is the number of distinct article ids; there is no
    # need to rebuild (join, de-duplicate and merge) every article to count them.
    num_articles = df["id_article"].nunique()

    return {
        "# Articles": num_articles,
        "# Sentences": num_sent,
        "Avg. Sentence Length": np.round(sent_str_len, 1),
        "Avg. Persuasion Techniques per Sentence": np.round(avg_persuasion_per_text, 1),
        "Avg. Entities per Sentence": np.round(avg_entities_per_text, 1),
    }


# Build the summary table: one column per dataset, one row per statistic
# (the frame is transposed after construction).
rows = []
for dataset in [cidii_df, covid_df, climate_fever_df, euvsdisinfo_df]:
    rows.append(get_df_statistics(dataset))

statistics_df = pd.DataFrame(
    rows, index=["CIDII", "COVID", "Climate Fever", "EUvsDisinfo"]
).T

statistics_df

In [None]:
def plot_pt_distribution_single_bar(*dfs, labels, semeval_labels=None):
    """
    Plot the relative frequency of each persuasion technique per dataset as
    grouped horizontal bars and save the figure to "figures/pt_distribution.pdf".

    Parameters:
    - *dfs: DataFrames with a 'persuasion_techniques' column (a list per row).
    - labels: display names, one per DataFrame, in the same order.
    - semeval_labels: techniques to report; the counts are reindexed to this
      list (techniques that never occur get 0). None keeps whatever techniques
      occur in the data.

    Returns:
    - combined_df: techniques x labels table of relative frequencies in
      percent, sorted by ascending row sum.
    - a list with the total number of technique occurrences per dataset.
    - the technique names of combined_df in reversed order.

    Raises:
    - ValueError: if a DataFrame has no 'persuasion_techniques' column.

    The input DataFrames are not modified.
    """

    def prepare_data(df):
        if "persuasion_techniques" not in df.columns:
            raise ValueError("DataFrame must contain 'persuasion_techniques' column.")

        # Explode the column as a Series so that no helper column is written
        # into the caller's DataFrame; empty lists explode to NaN and are dropped.
        techniques = df["persuasion_techniques"].explode().dropna()

        technique_counts = Counter(techniques)
        df_counts = (
            pd.DataFrame.from_dict(technique_counts, orient="index", columns=["Count"])
            .reindex(semeval_labels)
            .fillna(0)
        )
        total_count = df_counts["Count"].sum()

        if total_count > 0:
            df_counts["Relative_Frequency (%)"] = (
                df_counts["Count"] / total_count
            ) * 100
        else:
            df_counts["Relative_Frequency (%)"] = 0

        return df_counts[["Relative_Frequency (%)"]].sort_index(), total_count

    prepared_data = []
    total_counts = {}
    for df, label in zip(dfs, labels):
        df_counts, total_count = prepare_data(df)
        prepared_data.append(df_counts["Relative_Frequency (%)"])
        total_counts[label] = total_count

    combined_df = pd.DataFrame(prepared_data, index=labels).T

    # Order the techniques by their summed share across datasets.
    combined_df["Total_Frequency"] = combined_df.sum(axis=1)
    combined_df = combined_df.sort_values(by="Total_Frequency", ascending=True).drop(
        columns="Total_Frequency"
    )

    max_frequency = combined_df.values.max()

    fig, ax = plt.subplots(figsize=(6, 5))
    # 0.20 per bar for up to four datasets; shrink for more so that the bars of
    # neighbouring techniques never overlap.
    bar_height = min(0.20, 0.8 / max(len(labels), 1))
    indices = np.arange(len(combined_df))

    for i, label in enumerate(labels):
        ax.barh(indices + i * bar_height, combined_df[label], bar_height, label=label)

    ax.set_xlabel("Proportion (%)", fontsize=10)
    # Centre each tick on its group of bars.
    ax.set_yticks(indices + bar_height * (len(labels) - 1) / 2)

    ax.set_yticklabels([idx.replace("_", " ") for idx in combined_df.index], fontsize=10)
    ax.set_xlim(0, max_frequency + 5)
    ax.legend(fontsize=8, title="Disinformation Domain", title_fontsize=10)

    plt.grid(axis="x", linestyle="--", alpha=0.6)
    plt.tight_layout()
    plt.savefig("figures/pt_distribution.pdf", bbox_inches="tight", dpi=600)
    plt.show()

    return combined_df, list(total_counts.values()), list(reversed(combined_df.index))


# cidii_df_rebuilt = rebuild_articles(cidii_df)
# covid_df_rebuilt = rebuild_articles(covid_df)
# climate_fever_df_rebuilt = rebuild_articles(climate_fever_df)
# euvsdisinfo_df_rebuilt = rebuild_articles(euvsdisinfo_df)

# cidii_df["persuasion_techniques"] = cidii_df[
#     "persuasion_techniques"
# ].apply(lambda x: ",".join(x))
# covid_df["persuasion_techniques"] = covid_df[
#     "persuasion_techniques"
# ].apply(lambda x: ",".join(x))
# climate_fever_df["persuasion_techniques"] = climate_fever_df[
#     "persuasion_techniques"
# ].apply(lambda x: ",".join(x))
# euvsdisinfo_df["persuasion_techniques"] = euvsdisinfo_df[
#     "persuasion_techniques"
# ].apply(lambda x: ",".join(x))

# Plot the technique distribution and keep the frequency table, the total
# technique counts per dataset and the technique order for the following cells.
frequencies_df, total_counts, yticklabels = plot_pt_distribution_single_bar(
    cidii_df,
    covid_df,
    climate_fever_df,
    euvsdisinfo_df,
    labels=("Islamic Issues", "COVID-19", "Climate Change", "Russo-Ukrainian War"),
    semeval_labels=SEMEVAL_LABELS,
)


def highlight_max_in_row(row):
    """
    Return one CSS string per cell of the row: a red background for every cell
    equal to the row maximum, an empty string for the others.
    """
    peak = row.max()
    return ["background-color: red" if cell == peak else "" for cell in row]


# Highlight, for every technique (row), the dataset with the highest share.
styled_df = frequencies_df.style.apply(highlight_max_in_row, axis=1)
styled_df

In [None]:
datasets = ["CIDII", "COVID", "Climate Fever", "EUvsDisinfo"]
# Put the techniques in SEMEVAL_LABELS order and convert percentages to
# proportions; each row of data_proportions is one dataset's distribution.
frequencies_df = frequencies_df.loc[SEMEVAL_LABELS]
data_proportions = frequencies_df.T.to_numpy() / 100
# Compute pairwise Jensen-Shannon divergence between each dataset
js_divergence_results = {}
for i in range(len(data_proportions)):
    for j in range(i + 1, len(data_proportions)):
        # scipy's jensenshannon() returns the Jensen-Shannon *distance*, i.e.
        # the square root of the divergence, so square it to get the divergence
        # that is stored and reported below.
        js_divergence = jensenshannon(data_proportions[i], data_proportions[j]) ** 2
        js_divergence_results[(datasets[i], datasets[j])] = js_divergence

# Display the results
for pair, divergence in js_divergence_results.items():
    print(
        f"Jensen-Shannon Divergence between {pair[0]} and {pair[1]}: {divergence:.3f}"
    )

In [None]:
def calculate_odds_ratios_and_significance(data_proportions, raw_data, labels, row_order=None):
    """
    Calculate odds ratios and significance for each dataset and technique.

    Each dataset is compared against the remaining datasets: the odds ratio is
    computed from the technique proportions (the other datasets' proportions
    are averaged), while significance comes from Fisher's exact test on row
    counts (rows that contain / do not contain the technique, in this dataset
    versus all other datasets pooled).

    Parameters:
    - data_proportions: pd.DataFrame, proportions of each technique in each dataset
      (one row per dataset, one column per technique; every row must sum to 1).
    - raw_data: list of DataFrames, one for each dataset in the row order of
      data_proportions, each with a 'persuasion_techniques' column.
    - labels: list, persuasion technique labels.
    - row_order: optional list of technique labels giving the row order of both
      results. When omitted, the module-level ``yticklabels`` is used.

    Returns:
    - odds_ratios_df: pd.DataFrame of odds ratios (techniques x datasets), rounded to 2 decimals.
    - significance_matrix: pd.DataFrame of boolean values indicating statistical significance (p < 0.05).

    Raises:
    - ValueError: if the proportions of a dataset do not sum to 1.
    """
    # Step 1: Calculate Odds Ratios
    if not all(data_proportions.sum(axis=1).round(10) == 1.0):
        raise ValueError("Proportions in each row must sum to 1.")

    odds_ratio_columns = {}
    for domain in data_proportions.index:
        is_domain = data_proportions.index == domain

        # Proportions of the dataset under comparison
        dataset_proportions = data_proportions[is_domain].squeeze()

        # Combine the other datasets by averaging proportions
        combined_proportions = data_proportions[~is_domain].mean(axis=0)

        # The small epsilon avoids a division by zero when a proportion is 1.
        odds_combined = combined_proportions / ((1 - combined_proportions) + 1e-10)
        odds_dataset = dataset_proportions / ((1 - dataset_proportions) + 1e-10)

        odds_ratio_columns[domain] = odds_dataset / odds_combined

    # Techniques as rows (restricted to `labels`), datasets as columns
    odds_ratios_df = pd.DataFrame(odds_ratio_columns).loc[labels]

    # Step 2: Build the 2x2 contingency counts.
    # Count the rows containing each technique once per dataset; the counts for
    # "all other datasets" then follow by subtraction from the totals.
    dataset_sizes = [len(df) for df in raw_data]
    technique_hits = [
        {
            technique: int(
                df["persuasion_techniques"]
                .apply(lambda x, technique=technique: technique in x)
                .sum()
            )
            for technique in labels
        }
        for df in raw_data
    ]
    total_size = sum(dataset_sizes)
    total_hits = {
        technique: sum(hits[technique] for hits in technique_hits)
        for technique in labels
    }

    raw_counts = {}
    for i, (dataset, _) in enumerate(zip(data_proportions.index, raw_data)):
        for technique in labels:
            a = technique_hits[i][technique]  # this dataset, with technique
            b = dataset_sizes[i] - a  # this dataset, without technique
            c = total_hits[technique] - a  # other datasets, with technique
            d = (total_size - dataset_sizes[i]) - c  # other datasets, without
            raw_counts[(technique, dataset)] = {"a": a, "b": b, "c": c, "d": d}

    # Step 3: Calculate Statistical Significance
    significance_matrix = pd.DataFrame(
        False, index=odds_ratios_df.index, columns=odds_ratios_df.columns
    )

    for technique in odds_ratios_df.index:
        for dataset in odds_ratios_df.columns:
            counts = raw_counts[(technique, dataset)]
            _, p_value = fisher_exact(
                [[counts["a"], counts["b"]], [counts["c"], counts["d"]]]
            )
            significance_matrix.loc[technique, dataset] = bool(p_value < 0.05)

    # Fall back to the notebook-level ordering when no explicit order is given.
    order = yticklabels if row_order is None else row_order
    odds_ratios_df = odds_ratios_df.astype(float).round(2).reindex(order)
    significance_matrix = significance_matrix.reindex(order)

    return odds_ratios_df, significance_matrix

# Wrap the proportions array (rows: datasets, columns: SEMEVAL_LABELS order) in
# a labelled DataFrame, reorder its columns to the plot's technique order, and
# compute odds ratios and significance for every dataset.
data_proportions_df = pd.DataFrame(data_proportions, index=["Islamic Issues", "COVID-19", "Climate Change", "Russo-Ukrainian War"], columns=SEMEVAL_LABELS)
data_proportions_df= data_proportions_df[yticklabels]
odds_ratios_df, significance_mask = calculate_odds_ratios_and_significance(data_proportions_df, [cidii_df, covid_df, climate_fever_df, euvsdisinfo_df], SEMEVAL_LABELS)

In [None]:
def highlight_significant_values(odds_ratios_df, significance_matrix):
    """
    Style the odds-ratio table so that significant cells stand out.

    Parameters:
    - odds_ratios_df: pd.DataFrame containing odds ratios.
    - significance_matrix: pd.DataFrame of truthy/falsy values, addressable by
      the same row and column labels as odds_ratios_df.

    Returns:
    - A Styler of odds_ratios_df, formatted with two decimals, in which every
      cell flagged in significance_matrix is bold on a red background.
    """
    significant_css = "background-color: red; font-weight: bold;"

    def style_row(row):
        # One CSS string per column of the table, looked up by the row's label.
        return [
            significant_css if significance_matrix.loc[row.name, column] else ""
            for column in odds_ratios_df.columns
        ]

    return odds_ratios_df.style.format(precision=2).apply(style_row, axis=1)

# Use human-readable technique names (underscores replaced by spaces) on both
# tables so their row labels still match, then display the styled odds ratios.
odds_ratios_df.index = [idx.replace("_", " ") for idx in odds_ratios_df.index]
significance_mask.index = [idx.replace("_", " ") for idx in significance_mask.index]
styled_odds_ratios_df = highlight_significant_values(odds_ratios_df, significance_mask)
styled_odds_ratios_df

In [None]:
def load_dataset_with_liwc(dataset_name):
    """
    Reload a dataset and load the LIWC rows that belong to its retained rows.

    Parameters:
    - dataset_name: a name accepted by load_dataset; the LIWC table is read
      from "datasets/liwc/liwc_<dataset_name>.csv".

    Returns:
    - (dataset_df, liwc_df): the dataset as returned by load_dataset, and the
      LIWC table restricted to the ('id_sentence', 'id_article') pairs present
      in that dataset, with a fresh 0..n-1 index.
    """
    key_columns = ["id_sentence", "id_article"]

    dataset_df = load_dataset(dataset_name)
    liwc_df = pd.read_csv(f"datasets/liwc/liwc_{dataset_name}.csv")

    # Compare rows by their (id_sentence, id_article) pair.
    kept_keys = dataset_df[key_columns].apply(tuple, axis=1)
    liwc_keys = liwc_df[key_columns].apply(tuple, axis=1)
    liwc_df = liwc_df[liwc_keys.isin(kept_keys)].reset_index(drop=True)

    return dataset_df, liwc_df


cidii_df, cidii_liwc_df = load_dataset_with_liwc("cidii")
covid_df, covid_liwc_df = load_dataset_with_liwc("covid")
climate_fever_df, climate_fever_liwc_df = load_dataset_with_liwc("climate_fever")
euvsdisinfo_df, euvsdisinfo_liwc_df = load_dataset_with_liwc("euvsdisinfo")

In [None]:
def calculate_average_liwc_by_technique(df, techniques=None):
    """
    Aggregate the LIWC scores of the rows belonging to each persuasion technique.

    NOTE(review): despite the function name, the scores of the matching rows
    are summed, not averaged. Confirm whether a mean was intended before
    relying on the magnitudes; the aggregation is kept as it was.

    Parameters:
    - df: DataFrame containing a 'persuasion_techniques' column (a string per
      row; missing values are treated as an empty string), a 'Segment' column,
      and the LIWC features as every column to the right of 'Segment'.
    - techniques: optional iterable of technique names to aggregate for.
      Defaults to SEMEVAL_LABELS.

    Returns:
    - A DataFrame with persuasion techniques as rows (index name
      "Persuasion Technique") and the summed LIWC scores as columns. A
      technique matches a row when its name occurs as a substring of the row's
      'persuasion_techniques' string.

    The input DataFrame is not modified.
    """
    # Work on a filled copy of the column instead of writing back into `df`.
    techniques_text = df["persuasion_techniques"].fillna("")

    unique_techniques = SEMEVAL_LABELS if techniques is None else techniques

    liwc_features = df.columns[df.columns.get_loc("Segment") + 1 :]

    liwc_scores = {}
    for technique in unique_techniques:
        technique_rows = df[techniques_text.str.contains(technique, regex=False)]
        liwc_scores[technique] = technique_rows[liwc_features].sum()

    average_liwc_df = pd.DataFrame(liwc_scores).T
    average_liwc_df.index.name = "Persuasion Technique"

    return average_liwc_df.fillna(0)

In [None]:
def plot_effect_size_heatmaps(
    persuasion_technique: str, datasets: list, dataset_names: list, top_n: int
):
    """
    Draw, for each of four datasets, a heatmap comparing its LIWC scores for one
    persuasion technique with those of the other three datasets.

    The value shown for a dataset/comparison pair is the ratio of the dataset's
    technique row to the comparison dataset's technique row, per LIWC feature.
    Only the top_n features with the highest mean ratio are displayed.

    Parameters:
        persuasion_technique (str): Row label of the technique to compare.
        datasets (list): Exactly 4 DataFrames indexed by persuasion technique,
                         all with the same LIWC feature columns.
        dataset_names (list): Exactly 4 names, in the same order as datasets.
        top_n (int): Number of features to show in each heatmap.

    Raises:
        AssertionError: if there are not exactly 4 datasets and 4 names, or if
                        the datasets do not share the same columns.
    """
    has_four_of_each = len(datasets) == 4 and len(dataset_names) == 4
    assert has_four_of_each, "Please provide exactly 4 datasets and 4 dataset names."

    liwc_features = datasets[0].columns
    reference_columns = list(liwc_features)
    for candidate in datasets:
        assert (
            list(candidate.columns) == reference_columns
        ), "All datasets must have the same LIWC feature columns."

    technique_means = [frame.loc[persuasion_technique] for frame in datasets]
    positions = range(len(technique_means))

    def ratios_against_others(position):
        # One row per *other* dataset: this dataset's scores divided by theirs.
        others = [k for k in positions if k != position]
        return pd.DataFrame(
            [technique_means[position] / technique_means[k] for k in others],
            index=[dataset_names[k] for k in others],
            columns=liwc_features,
        )

    # Build every ratio table before drawing anything.
    effect_size_frames = [ratios_against_others(position) for position in positions]

    for name, effect_df in zip(dataset_names, effect_size_frames):
        ranking = effect_df.mean(axis=0).sort_values(ascending=False)
        effect_df_top_n = effect_df[ranking.index[:top_n]]

        plt.figure(figsize=(10, 6))
        sns.heatmap(effect_df_top_n, annot=True, cmap="coolwarm", cbar=True)
        plt.title(
            f"Top {top_n} Effect Size Heatmap for {name} - Persuasion Technique: {persuasion_technique}"
        )
        plt.xlabel("LIWC Features")
        plt.ylabel("Comparison Dataset")
        plt.xticks(rotation=45)
        plt.show()

In [None]:
# Per-technique LIWC tables (techniques as rows, LIWC features as columns),
# one per dataset.
avg_cidii_liwc = calculate_average_liwc_by_technique(cidii_liwc_df)
avg_covid_liwc = calculate_average_liwc_by_technique(covid_liwc_df)
avg_climate_fever_liwc = calculate_average_liwc_by_technique(climate_fever_liwc_df)
avg_euvsdisinfo_liwc = calculate_average_liwc_by_technique(euvsdisinfo_liwc_df)

In [None]:
# Grouping of LIWC feature names into coarse categories.
# NOTE: some features are listed under more than one category: "affiliation",
# "achieve" and "power" appear under both "Psychological Processes" and
# "Drives"; "politic", "ethnicity" and "tech" under both "Expanded Dictionary"
# and "Culture"; "time", "focuspast", "focuspresent" and "focusfuture" under
# both "Time orientation" and "Perception". Code that iterates this dict visits
# those features twice, and a feature -> category map built in iteration order
# keeps the last category listed.
liwc_categories = {
    "Summary Variables": [
        "WC",
        "Analytic",
        "Clout",
        "Authentic",
        "Tone",
        "WPS",
        "BigWords",
        "Dic",
    ],
    "Linguistic Variables": [
        "function",
        "pronoun",
        "ppron",
        "i",
        "we",
        "you",
        "shehe",
        "they",
        "ipron",
        "det",
        "article",
        "number",
        "prep",
        "auxverb",
        "adverb",
        "conj",
        "negate",
        "verb",
        "adj",
        "quantity",
    ],
    "Psychological Processes": [
        "Drives",
        "affiliation",
        "achieve",
        "power",
        # "Cognition",
        "allnone",
        "cogproc",
        "insight",
        "cause",
        "discrep",
        "tentat",
        "certitude",
        "differ",
        "memory",
        "Affect",
        "tone_pos",
        "tone_neg",
        "emotion",
        "emo_pos",
        "emo_neg",
        "emo_anx",
        "emo_anger",
        "emo_sad",
        "swear",
    ],
    "Social processes": [
        "socbehav",
        "prosocial",
        "polite",
        "conflict",
        "moral",
        "comm",
        "socrefs",
        "family",
        "friend",
        "female",
        "male",
    ],
    "Expanded Dictionary": ["Culture", "politic", "ethnicity", "tech"],
    "Lifestyle": ["leisure", "home", "work", "money", "relig"],
    "Physical": [
        "health",
        "illness",
        "wellness",
        "mental",
        "substances",
        "sexual",
        "food",
        "death",
    ],
    "States": ["need", "want", "acquire", "lack", "fulfill", "fatigue"],
    "Motives": ["reward", "risk", "curiosity", "allure"],
    "Time orientation": ["time", "focuspast", "focuspresent", "focusfuture"],
    "Conversational": ["netspeak", "assent", "nonflu", "filler"],
    # "Affect": [
    #     "tone_pos",
    #     "tone_neg",
    #     "emotion",
    #     "emo_pos",
    #     "emo_neg",
    #     "emo_anx",
    #     "emo_anger",
    #     "emo_sad",
    # ],
    # "Cognition": [
    #     "allnone",
    #     "cogproc",
    #     "insight",
    #     "cause",
    #     "discrep",
    #     "tentat",
    #     "certitude",
    #     "differ",
    #     "memory",
    # ],
    "Drives": ["affiliation", "achieve", "power"],
    "Culture": ["politic", "ethnicity", "tech"],
    "Perception": [
        "attention",
        "motion",
        "space",
        "visual",
        "auditory",
        "feeling",
        "time",
        "focuspast",
        "focuspresent",
        "focusfuture",
    ],
    # "Punctuation": [
    #     "AllPunc", "Period", "Comma", "QMark", "Exclam", "Apostro", "OtherP"
    # ]
}

In [None]:
from matplotlib import patches as mpatches


def plot_consolidated_top_features(
    datasets, persuasion_technique, top_n, liwc_categories
):
    """
    Plot a heatmap of the LIWC features most characteristic of one technique.

    For every dataset, each LIWC feature is scored as the technique's value
    divided by the mean of that feature over all techniques where it is
    non-zero. The `top_n` best-scoring features of each dataset are pooled and
    the scores of the pooled features are drawn for every dataset.

    Parameters:
    - datasets (dict): Maps a display name to a DataFrame that is indexed by
      persuasion technique and has one column per LIWC feature.
    - persuasion_technique (str): Row label to score.
    - top_n (int): Number of best-scoring features taken from each dataset.
    - liwc_categories (dict): Maps a category name to a list of feature names.

    Returns:
    - None. The figure is shown with `plt.show()`.
    """

    def relative_score(avg_df, feature):
        # Boolean mask instead of positional integers: the index holds
        # technique labels, so integer keys must not be used for selection.
        column = avg_df.loc[:, feature]
        baseline = column[column != 0].mean()
        # An all-zero (or all-missing) column has no baseline; score it 0
        # rather than letting NaN leak into the ranking below.
        if pd.isna(baseline) or baseline == 0:
            return 0
        score = avg_df.loc[persuasion_technique, feature] / baseline
        return 0 if pd.isna(score) else score

    # A feature listed under several categories is scored once; the last
    # category that lists it is the one used for grouping.
    feature_group_map = {
        feature: group
        for group, features in liwc_categories.items()
        for feature in features
    }
    candidate_features = list(feature_group_map)

    # Step 1: score every candidate feature once per dataset.
    scores = {
        dataset_name: {
            feature: relative_score(avg_df, feature)
            for feature in candidate_features
        }
        for dataset_name, avg_df in datasets.items()
    }

    # Step 2: pool the top N features of each dataset.
    all_top_features = set()
    for per_feature in scores.values():
        ranked = sorted(per_feature, key=per_feature.get, reverse=True)
        all_top_features.update(ranked[:top_n])

    # Step 3: order the pooled features by category, then by name.
    sorted_features = sorted(
        all_top_features, key=lambda f: (feature_group_map[f], f)
    )

    # Step 4: one column per dataset, one row per pooled feature.
    sorted_df = pd.DataFrame(
        {
            dataset_name: [per_feature[f] for f in sorted_features]
            for dataset_name, per_feature in scores.items()
        },
        index=sorted_features,
    )

    # Step 5: plot the heatmap (datasets as rows, features as columns).
    plt.figure(figsize=(15, 4))
    sns.heatmap(
        sorted_df.transpose(),
        annot=True,
        cmap="YlGnBu",
        fmt=".2f",
        cbar_kws={"label": "Normalized Value (per column)"},
        linewidths=0.5,
        xticklabels=sorted_features,
    )
    plt.xlabel("LIWC Feature")
    plt.ylabel("Disinformation Domain")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()

    plt.show()


# Example usage: compare one technique across the four domain-level frames.
# Keys are the row labels shown on the heatmap's y-axis.
datasets = {
    "Islamic Issues": avg_cidii_liwc,
    "COVID-19": avg_covid_liwc,
    "Climate Change": avg_climate_fever_liwc,
    "Russo-Ukrainian War": avg_euvsdisinfo_liwc,
}

# Technique to inspect; 8 is the number of top features taken from each dataset.
technique = "Appeal_to_Fear-Prejudice"
plot_consolidated_top_features(datasets, technique, 8, liwc_categories)

In [None]:
# Inspect the "Analytic" column of `cidii_liwc_df` (rendered as the cell output).
cidii_liwc_df["Analytic"]

In [None]:
def encode_semeval_labels(df):
    """
    Encode SEMEVAL labels as a binary matrix and extract the LIWC features.

    Parameters:
    - df (pd.DataFrame): Input DataFrame with a "persuasion_techniques" column
      (comma-separated technique names, or a list of names) and one column per
      LIWC feature listed in `liwc_categories`.

    Returns:
    - labels (np.array): Binary matrix (rows of `df` x `SEMEVAL_LABELS`);
      techniques not in `SEMEVAL_LABELS` are ignored and rows with a missing
      annotation stay all-zero.
    - liwc_features (np.array): Raw (unscaled) LIWC feature values, one column
      per entry of the flattened `liwc_categories` lists, in that order.
    """
    label_position = {name: pos for pos, name in enumerate(SEMEVAL_LABELS)}
    labels = np.zeros((len(df), len(SEMEVAL_LABELS)))

    # enumerate() gives the row *position*; the DataFrame index may hold
    # arbitrary labels (e.g. after filtering) and must not be used to address
    # rows of the numpy matrix.
    for row_pos, raw in enumerate(df["persuasion_techniques"]):
        if isinstance(raw, str):
            names = raw.split(",")
        elif isinstance(raw, (list, tuple, set)):
            names = raw
        else:
            names = ()  # missing annotation (None / NaN)
        for name in names:
            # Tolerate "A, B" style separators by stripping whitespace.
            col = label_position.get(str(name).strip())
            if col is not None:
                labels[row_pos, col] = 1

    # Flatten the category -> features mapping into one ordered list.
    feature_names = [f for group in liwc_categories.values() for f in group]

    # Features are returned as-is; no standardisation is applied.
    liwc_features = df.loc[:, feature_names].to_numpy()

    return labels, liwc_features


# Encode each dataset: a binary technique matrix plus the LIWC feature matrix.
cidii_labels, cidii_features = encode_semeval_labels(cidii_liwc_df)
covid_labels, covid_features = encode_semeval_labels(covid_liwc_df)
climate_fever_labels, climate_fever_features = encode_semeval_labels(
    climate_fever_liwc_df
)
euvsdisinfo_labels, euvsdisinfo_features = encode_semeval_labels(euvsdisinfo_liwc_df)

# Flattened feature list, built the same way as inside encode_semeval_labels so
# that names line up with the columns of the feature matrices. A feature listed
# under several categories appears once per listing.
liwc_feature_names = [f for sublist in liwc_categories.values() for f in sublist]


# Compute correlations with actual feature and technique names
def compute_correlations_with_significance(
    features, labels, feature_names, technique_names
):
    """
    Compute point-biserial correlations and p-values with named indices.

    Parameters:
    - features (np.array): Matrix of LIWC features (N x F). NaNs are replaced by 0.
    - labels (np.array): Binary matrix of persuasion techniques (N x T).
    - feature_names (list): F LIWC feature names (row labels of the result).
    - technique_names (list): T technique names (column labels of the result).

    Returns:
    - correlation_matrix (pd.DataFrame): float64 F x T correlation coefficients.
    - p_value_matrix (pd.DataFrame): float64 F x T p-values.
      Pairs where the feature or the label column is constant are NaN in both.

    Raises:
    - ValueError: If the name lists do not match the matrix shapes.
    """
    features = np.nan_to_num(np.asarray(features, dtype=float), nan=0)
    labels = np.asarray(labels)
    num_features = features.shape[1]
    num_techniques = labels.shape[1]

    if len(feature_names) != num_features:
        raise ValueError(
            f"expected {num_features} feature names, got {len(feature_names)}"
        )
    if len(technique_names) != num_techniques:
        raise ValueError(
            f"expected {num_techniques} technique names, got {len(technique_names)}"
        )

    def varies(column):
        return column.size > 1 and column.min() != column.max()

    # Correlation is undefined for a constant column; decide that once per
    # column instead of letting scipy warn for every feature-technique pair.
    feature_varies = [varies(features[:, j]) for j in range(num_features)]
    label_varies = [varies(labels[:, k]) for k in range(num_techniques)]

    # Plain float arrays keep the resulting frames numeric (not object dtype).
    correlations = np.full((num_features, num_techniques), np.nan)
    p_values = np.full((num_features, num_techniques), np.nan)

    for feature_idx in range(num_features):
        if not feature_varies[feature_idx]:
            continue
        for technique_idx in range(num_techniques):
            if not label_varies[technique_idx]:
                continue
            corr, p_val = pointbiserialr(
                features[:, feature_idx], labels[:, technique_idx]
            )
            correlations[feature_idx, technique_idx] = corr
            p_values[feature_idx, technique_idx] = p_val

    correlation_matrix = pd.DataFrame(
        correlations, index=feature_names, columns=technique_names
    )
    p_value_matrix = pd.DataFrame(
        p_values, index=feature_names, columns=technique_names
    )
    return correlation_matrix, p_value_matrix


# Technique names label the columns of the correlation / p-value matrices;
# they are the same SEMEVAL_LABELS used to build the binary label matrices.
technique_names = SEMEVAL_LABELS

# One (correlation, p-value) matrix pair per dataset:
# rows are LIWC features, columns are persuasion techniques.
cidii_correlations, cidii_p_values = compute_correlations_with_significance(
    cidii_features, cidii_labels, liwc_feature_names, technique_names
)
covid_correlations, covid_p_values = compute_correlations_with_significance(
    covid_features, covid_labels, liwc_feature_names, technique_names
)
climate_fever_correlations, climate_fever_p_values = (
    compute_correlations_with_significance(
        climate_fever_features,
        climate_fever_labels,
        liwc_feature_names,
        technique_names,
    )
)
euvsdisinfo_correlations, euvsdisinfo_p_values = compute_correlations_with_significance(
    euvsdisinfo_features, euvsdisinfo_labels, liwc_feature_names, technique_names
)

In [None]:
def generate_correlation_summary_with_absolute_and_pvalues(
    correlation_matrix, p_value_matrix, dataset_name, p_value_threshold=0.05
):
    """
    Flatten a correlation / p-value matrix pair into one long-format table.

    Parameters:
    - correlation_matrix (pd.DataFrame): Features (rows) x techniques (columns).
    - p_value_matrix (pd.DataFrame): Same shape and ordering as `correlation_matrix`.
    - dataset_name (str): Value written to the "Dataset" column.
    - p_value_threshold (float): Rows with a p-value below it are flagged significant.

    Returns:
    - pd.DataFrame with columns "LIWC Feature", "Persuasion Technique",
      "Correlation", "p-value", "Dataset", "Absolute Correlation" and
      "Significant"; rows without a correlation value are dropped.
    """
    n_features, n_techniques = correlation_matrix.shape

    # Flatten both matrices in the same row-major order so that every p-value
    # stays attached to its own correlation, whatever the NaN pattern of either
    # matrix and even when feature names repeat in the index.
    flattened = pd.DataFrame(
        {
            "LIWC Feature": np.repeat(
                correlation_matrix.index.to_numpy(), n_techniques
            ),
            "Persuasion Technique": np.tile(
                correlation_matrix.columns.to_numpy(), n_features
            ),
            "Correlation": pd.to_numeric(
                correlation_matrix.to_numpy().ravel(), errors="coerce"
            ),
            "p-value": pd.to_numeric(
                p_value_matrix.to_numpy().ravel(), errors="coerce"
            ),
        }
    )
    flattened = flattened.dropna(
        subset=["LIWC Feature", "Persuasion Technique", "Correlation"]
    ).reset_index(drop=True)

    # Add the dataset name
    flattened["Dataset"] = dataset_name

    # Add absolute correlation column
    flattened["Absolute Correlation"] = flattened["Correlation"].abs()

    # A missing p-value compares False, i.e. it is never flagged significant.
    flattened["Significant"] = flattened["p-value"] < p_value_threshold
    return flattened


# Build one long-format summary per dataset, keep the strongest significant
# (feature, technique) pairs of each, and pivot them for the heatmap.
p_value_threshold = 0.05

# Number of strongest significant pairs kept per dataset.
topn = 20

_PAIR_COLUMNS = ["LIWC Feature", "Persuasion Technique"]


def _strongest_significant(summary, n):
    """Return the n significant rows of `summary` with the largest absolute correlation."""
    # .copy() so the column assignment below works on an independent frame
    # rather than on a slice of `summary` (avoids SettingWithCopyWarning).
    significant = summary[summary["Significant"]].copy()
    significant["Absolute Correlation"] = pd.to_numeric(
        significant["Absolute Correlation"], errors="coerce"
    )
    significant = significant.drop_duplicates(_PAIR_COLUMNS + ["Dataset"])
    return significant.nlargest(n, "Absolute Correlation")


def _restrict_to_pairs(summary, pairs):
    """Return the rows of `summary` whose (feature, technique) pair is in `pairs`."""
    wanted = set(pairs)
    keep = np.fromiter(
        (
            pair in wanted
            for pair in zip(summary["LIWC Feature"], summary["Persuasion Technique"])
        ),
        dtype=bool,
        count=len(summary),
    )
    return summary.loc[keep]


cidii_summary = generate_correlation_summary_with_absolute_and_pvalues(
    cidii_correlations, cidii_p_values, "Islamic Issues", p_value_threshold
)
covid_summary = generate_correlation_summary_with_absolute_and_pvalues(
    covid_correlations, covid_p_values, "COVID-19", p_value_threshold
)
climate_fever_summary = generate_correlation_summary_with_absolute_and_pvalues(
    climate_fever_correlations,
    climate_fever_p_values,
    "Climate Change",
    p_value_threshold,
)
euvsdisinfo_summary = generate_correlation_summary_with_absolute_and_pvalues(
    euvsdisinfo_correlations,
    euvsdisinfo_p_values,
    "Russo-Ukrainian War",
    p_value_threshold,
)

# Top `topn` significant pairs of each dataset.
cidii_significant = _strongest_significant(cidii_summary, topn)
covid_significant = _strongest_significant(covid_summary, topn)
climate_fever_significant = _strongest_significant(climate_fever_summary, topn)
euvsdisinfo_significant = _strongest_significant(euvsdisinfo_summary, topn)

# Union of the (feature, technique) pairs that made any dataset's top list.
all_tuples = list(
    {
        pair
        for significant in (
            cidii_significant,
            covid_significant,
            climate_fever_significant,
            euvsdisinfo_significant,
        )
        for pair in zip(
            significant["LIWC Feature"], significant["Persuasion Technique"]
        )
    }
)

# Keep those pairs in every dataset's summary, significant there or not, so
# each selected pair can be compared across all domains.
cidii_summary = _restrict_to_pairs(cidii_summary, all_tuples)
covid_summary = _restrict_to_pairs(covid_summary, all_tuples)
climate_fever_summary = _restrict_to_pairs(climate_fever_summary, all_tuples)
euvsdisinfo_summary = _restrict_to_pairs(euvsdisinfo_summary, all_tuples)

# Combine the selected rows into a single frame, sorted by feature name.
top_correlations_all = pd.concat(
    [cidii_summary, covid_summary, climate_fever_summary, euvsdisinfo_summary]
).sort_values("LIWC Feature", ascending=False)

# One row per (feature, technique) pair, one column per dataset; .dropna()
# keeps only the pairs that have a correlation in every dataset. The index
# levels keep the names given by the pivot: level 0 is the LIWC feature,
# level 1 the persuasion technique.
pivot_table = top_correlations_all.pivot_table(
    index=_PAIR_COLUMNS,
    columns=["Dataset"],
    values="Correlation",
).dropna()

# Transpose so datasets become rows and (feature, technique) pairs columns.
pivot_table = pivot_table.T

# Cells without a value (none remain after the dropna above).
mask = pd.isnull(pivot_table)


# Custom annotation formatting function
def custom_fmt_with_significance(x, p):
    """
    Build the heatmap annotation for one cell.

    The value is rendered with two decimals and without its leading zero
    (".50", "-.25"). Values with p < 0.05 get an asterisk on a second line;
    a missing value yields an empty string.
    """
    if pd.isnull(x):
        return ""

    text = f"{x:.2f}".lstrip("0").replace("-0", "-")
    is_significant = p < 0.05
    return text + "\n*" if is_significant else text


# p-values in the same layout as `pivot_table`: datasets as rows,
# (feature, technique) pairs as columns.
p_value_pivot = top_correlations_all.pivot_table(
    index=["LIWC Feature", "Persuasion Technique"],
    columns=["Dataset"],
    values="p-value",
).T

# Annotation text per cell: each correlation column is combined with the
# p-value column of the same (feature, technique) pair.
annotations = pivot_table.apply(
    lambda col: col.combine(
        p_value_pivot[col.name],  # Match corresponding p-values
        custom_fmt_with_significance,
    )
)

# Wide figure to reduce label overlap
plt.figure(figsize=(30, 15))

# Generate heatmap
ax = sns.heatmap(
    pivot_table.fillna(0),
    annot=annotations,  # Use the formatted annotations with significance
    fmt="",  # Leave formatting to custom function
    cmap="coolwarm",
    center=0,
    cbar_kws={"label": "Correlation"},
    annot_kws={"fontsize": 16},  # Adjust annotation font size for readability
    # Force one tick per column: the labels set below are one per column, and
    # the default "auto" mode may thin the ticks, which would misalign them.
    xticklabels=True,
)

# Adjust the color bar title font size
cbar = ax.collections[0].colorbar
cbar.set_label("Correlation", fontsize=16)

# Adjust the color bar tick labels font size
cbar.ax.tick_params(labelsize=14)

# Short display names for the techniques; unmapped names are shown unchanged.
technique_mapping = {
    "Causal_Oversimplification": "Causal Oversimpl.",
    "Questioning_the_Reputation": "Quest. Reputation",
    "False_Dilemma-No_Choice": "False Dilemma",
    "Exaggeration-Minimisation": "Exag./Minim.",
    "Conversation_Killer": "Conv. Killer",
    "Name_Calling-Labeling": "Labelling",
    "Loaded_Language": "Loaded Lang.",
    "Repetition": "Repetition",
    "Appeal_to_Fear-Prejudice": "Appeal Fear",
    "Flag_Waving": "Flag Waving",
    "Doubt": "Doubt",
    "Appeal_to_Authority": "Appeal Authority",
    "Appeal_to_Values": "Appeal Values",
    "Slogans": "Slogans",
}

# Column keys are (LIWC feature, technique) tuples: position 1 is the
# technique (shortened for display), position 0 the feature.
short_techniques = [
    technique_mapping.get(tup[1], tup[1]) for tup in pivot_table.columns
]
features = [tup[0] for tup in pivot_table.columns]

# Techniques go on the tick labels; features are drawn as separate text
# centred on each column (x in data units, y in axes units).
ax.set_xticklabels(
    short_techniques, rotation=90, ha="center", fontsize=16, color="black"
)
for idx, label in enumerate(features):
    ax.text(
        idx + 0.5,
        -1.0,
        label,
        ha="center",
        fontsize=16,
        rotation=90,
        color="black",
        transform=ax.get_xaxis_transform(),
    )

ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=16, color="black")

# Axis labels
plt.xlabel("Persuasion Technique and LIWC Feature", fontsize=18, labelpad=100)
plt.ylabel("Disinformation Domain", fontsize=18)

plt.tight_layout()
# NOTE(review): the variance-based heatmap cell further down saves to this same
# path, so running both cells leaves only the later figure on disk. The
# "figures" directory is not created here; presumably it already exists.
plt.savefig("figures/correlation_heatmap.pdf", bbox_inches="tight", dpi=600)
plt.show()

print(
    "This version shows the highest absolute correlations between LIWC features and persuasion techniques for each dataset."
)

In [None]:
def generate_correlation_summary_with_absolute_and_pvalues(
    correlation_matrix, p_value_matrix, dataset_name, p_value_threshold=0.05
):
    """
    Flatten a correlation / p-value matrix pair into one long-format table.

    NOTE(review): a function with this name is also defined in an earlier
    cell; this definition shadows it once the cell runs.

    Parameters:
    - correlation_matrix (pd.DataFrame): Features (rows) x techniques (columns).
    - p_value_matrix (pd.DataFrame): Same shape and ordering as `correlation_matrix`.
    - dataset_name (str): Value written to the "Dataset" column.
    - p_value_threshold (float): Rows with a p-value below it are flagged significant.

    Returns:
    - pd.DataFrame with columns "LIWC Feature", "Persuasion Technique",
      "Correlation", "p-value", "Dataset", "Absolute Correlation" and
      "Significant"; rows without a correlation value are dropped.
    """
    n_features, n_techniques = correlation_matrix.shape

    # Flatten both matrices in the same row-major order so that every p-value
    # stays attached to its own correlation, whatever the NaN pattern of either
    # matrix and even when feature names repeat in the index.
    flattened = pd.DataFrame(
        {
            "LIWC Feature": np.repeat(
                correlation_matrix.index.to_numpy(), n_techniques
            ),
            "Persuasion Technique": np.tile(
                correlation_matrix.columns.to_numpy(), n_features
            ),
            "Correlation": pd.to_numeric(
                correlation_matrix.to_numpy().ravel(), errors="coerce"
            ),
            "p-value": pd.to_numeric(
                p_value_matrix.to_numpy().ravel(), errors="coerce"
            ),
        }
    )
    flattened = flattened.dropna(
        subset=["LIWC Feature", "Persuasion Technique", "Correlation"]
    ).reset_index(drop=True)

    # Add the dataset name
    flattened["Dataset"] = dataset_name

    # Add absolute correlation column
    flattened["Absolute Correlation"] = flattened["Correlation"].abs()

    # A missing p-value compares False, i.e. it is never flagged significant.
    flattened["Significant"] = flattened["p-value"] < p_value_threshold
    return flattened


# Build one long-format summary per dataset, then keep the (feature, technique)
# pairs whose significant absolute correlation varies most across datasets.
p_value_threshold = 0.05

_PAIR_COLUMNS = ["LIWC Feature", "Persuasion Technique"]


def _significant_pairs(summary):
    """Return the significant rows of `summary`, one per (feature, technique, dataset)."""
    # .copy() so the column assignment below works on an independent frame
    # rather than on a slice of `summary` (avoids SettingWithCopyWarning).
    significant = summary[summary["Significant"]].copy()
    significant["Absolute Correlation"] = pd.to_numeric(
        significant["Absolute Correlation"], errors="coerce"
    )
    return significant.drop_duplicates(_PAIR_COLUMNS + ["Dataset"])


def _restrict_to_pairs(summary, pairs):
    """Return the rows of `summary` whose (feature, technique) pair is in `pairs`."""
    wanted = set(pairs)
    keep = np.fromiter(
        (
            pair in wanted
            for pair in zip(summary["LIWC Feature"], summary["Persuasion Technique"])
        ),
        dtype=bool,
        count=len(summary),
    )
    return summary.loc[keep]


cidii_summary = generate_correlation_summary_with_absolute_and_pvalues(
    cidii_correlations, cidii_p_values, "Islamic Issues", p_value_threshold
)
covid_summary = generate_correlation_summary_with_absolute_and_pvalues(
    covid_correlations, covid_p_values, "COVID-19", p_value_threshold
)
climate_fever_summary = generate_correlation_summary_with_absolute_and_pvalues(
    climate_fever_correlations,
    climate_fever_p_values,
    "Climate Change",
    p_value_threshold,
)
euvsdisinfo_summary = generate_correlation_summary_with_absolute_and_pvalues(
    euvsdisinfo_correlations,
    euvsdisinfo_p_values,
    "Russo-Ukrainian War",
    p_value_threshold,
)

# All significant pairs of each dataset. No top-N cut is applied at this
# stage; the selection happens by variance further down.
cidii_significant = _significant_pairs(cidii_summary)
covid_significant = _significant_pairs(covid_summary)
climate_fever_significant = _significant_pairs(climate_fever_summary)
euvsdisinfo_significant = _significant_pairs(euvsdisinfo_summary)

# Union of the (feature, technique) pairs that are significant in any dataset.
all_tuples = list(
    {
        pair
        for significant in (
            cidii_significant,
            covid_significant,
            climate_fever_significant,
            euvsdisinfo_significant,
        )
        for pair in zip(
            significant["LIWC Feature"], significant["Persuasion Technique"]
        )
    }
)

# Keep those pairs in every dataset's summary, significant there or not.
cidii_summary = _restrict_to_pairs(cidii_summary, all_tuples)
covid_summary = _restrict_to_pairs(covid_summary, all_tuples)
climate_fever_summary = _restrict_to_pairs(climate_fever_summary, all_tuples)
euvsdisinfo_summary = _restrict_to_pairs(euvsdisinfo_summary, all_tuples)

# Combine the selected rows into a single frame, sorted by feature name.
top_correlations_all = pd.concat(
    [cidii_summary, covid_summary, climate_fever_summary, euvsdisinfo_summary]
).sort_values("LIWC Feature", ascending=False)

# --- Select the pairs whose strength differs most between datasets ---

# Outer-merge the significant rows of all datasets on the pair. Column naming
# after the three merges: "<col>_cidii" and "<col>_covid" come from the first
# merge; the climate columns keep their plain names because nothing collides
# with them yet; the last merge then collides with those plain names and
# suffixes only the right-hand side with "_euvsdisinfo".
merged_significant = (
    cidii_significant.merge(
        covid_significant,
        on=["LIWC Feature", "Persuasion Technique"],
        how="outer",
        suffixes=("_cidii", "_covid"),
    )
    .merge(
        climate_fever_significant,
        on=["LIWC Feature", "Persuasion Technique"],
        how="outer",
        suffixes=("", "_climate"),
    )
    .merge(
        euvsdisinfo_significant,
        on=["LIWC Feature", "Persuasion Technique"],
        how="outer",
        suffixes=("", "_euvsdisinfo"),
    )
)

# A pair that is not significant in a dataset counts as 0 for that dataset.
merged_significant = merged_significant.fillna(0)

# Per-dataset absolute correlations (the plain column is the climate one).
merged_significant["Correlation_cidii"] = pd.to_numeric(
    merged_significant["Absolute Correlation_cidii"], errors="coerce"
)
merged_significant["Correlation_covid"] = pd.to_numeric(
    merged_significant["Absolute Correlation_covid"], errors="coerce"
)
merged_significant["Correlation_climate"] = pd.to_numeric(
    merged_significant["Absolute Correlation"], errors="coerce"
)
merged_significant["Correlation_euvsdisinfo"] = pd.to_numeric(
    merged_significant["Absolute Correlation_euvsdisinfo"], errors="coerce"
)

_DATASET_CORRELATION_COLUMNS = [
    "Correlation_cidii",
    "Correlation_covid",
    "Correlation_climate",
    "Correlation_euvsdisinfo",
]

# Spread of each pair's absolute correlation across the four datasets.
merged_significant["Variance"] = merged_significant[
    _DATASET_CORRELATION_COLUMNS
].var(axis=1)
merged_significant["Standard Deviation"] = merged_significant[
    _DATASET_CORRELATION_COLUMNS
].std(axis=1)

# Number of highest-variance pairs to keep
top_n = 50  # Adjust this value as needed

merged_significant_sorted = merged_significant.sort_values(
    by="Variance", ascending=False
)
top_discrepant_pairs = merged_significant_sorted.head(top_n)

# Relevant columns for interpretation
top_discrepant_pairs_summary = top_discrepant_pairs[
    ["LIWC Feature", "Persuasion Technique"]
    + _DATASET_CORRELATION_COLUMNS
    + ["Variance", "Standard Deviation"]
]

# Restrict the combined table to the selected high-variance pairs.
top_correlations_all = _restrict_to_pairs(
    top_correlations_all,
    zip(
        top_discrepant_pairs["LIWC Feature"],
        top_discrepant_pairs["Persuasion Technique"],
    ),
)

# One row per (feature, technique) pair, one column per dataset; .dropna()
# keeps only the pairs that have a correlation in every dataset. The index
# levels keep the names given by the pivot: level 0 is the LIWC feature,
# level 1 the persuasion technique.
pivot_table = top_correlations_all.pivot_table(
    index=_PAIR_COLUMNS,
    columns=["Dataset"],
    values="Correlation",
).dropna()

# Transpose so datasets become rows and (feature, technique) pairs columns.
pivot_table = pivot_table.T

# Cells without a value (none remain after the dropna above).
mask = pd.isnull(pivot_table)


# Custom annotation formatting function
def custom_fmt_with_significance(x, p):
    """
    Build the heatmap annotation for one cell.

    The value is rendered with two decimals and without its leading zero
    (".50", "-.25"). Values with p < 0.05 get an asterisk on a second line;
    a missing value yields an empty string.
    """
    if pd.isnull(x):
        return ""

    text = f"{x:.2f}".lstrip("0").replace("-0", "-")
    is_significant = p < 0.05
    return text + "\n*" if is_significant else text


# p-values in the same layout as `pivot_table`: datasets as rows,
# (feature, technique) pairs as columns.
p_value_pivot = top_correlations_all.pivot_table(
    index=["LIWC Feature", "Persuasion Technique"],
    columns=["Dataset"],
    values="p-value",
).T

# Annotation text per cell: each correlation column is combined with the
# p-value column of the same (feature, technique) pair.
annotations = pivot_table.apply(
    lambda col: col.combine(
        p_value_pivot[col.name],  # Match corresponding p-values
        custom_fmt_with_significance,
    )
)

# Wide figure to reduce label overlap
plt.figure(figsize=(30, 17))

# Generate heatmap
ax = sns.heatmap(
    pivot_table.fillna(0),
    annot=annotations,  # Use the formatted annotations with significance
    fmt="",  # Leave formatting to custom function
    cmap="coolwarm",
    center=0,
    cbar_kws={"label": "Correlation"},
    annot_kws={"fontsize": 16},  # Adjust annotation font size for readability
    # Force one tick per column: the labels set below are one per column, and
    # the default "auto" mode may thin the ticks, which would misalign them.
    xticklabels=True,
)

# Adjust the color bar title font size
cbar = ax.collections[0].colorbar
cbar.set_label("Correlation", fontsize=16)

# Adjust the color bar tick labels font size
cbar.ax.tick_params(labelsize=14)

# Short display names for the techniques; unmapped names are shown unchanged.
technique_mapping = {
    "Causal_Oversimplification": "Causal Oversimpl.",
    "Questioning_the_Reputation": "Quest. Reputation",
    "False_Dilemma-No_Choice": "False Dilemma",
    "Exaggeration-Minimisation": "Exag./Minim.",
    "Conversation_Killer": "Conv. Killer",
    "Name_Calling-Labeling": "Labelling",
    "Loaded_Language": "Loaded Lang.",
    "Repetition": "Repetition",
    "Appeal_to_Fear-Prejudice": "Appeal Fear",
    "Flag_Waving": "Flag Waving",
    "Doubt": "Doubt",
    "Appeal_to_Authority": "Appeal Authority",
    "Appeal_to_Values": "Appeal Values",
    "Slogans": "Slogans",
}

# Column keys are (LIWC feature, technique) tuples: position 1 is the
# technique (shortened for display), position 0 the feature.
short_techniques = [
    technique_mapping.get(tup[1], tup[1]) for tup in pivot_table.columns
]
features = [tup[0] for tup in pivot_table.columns]

# Techniques go on the tick labels; features are drawn as separate text
# centred on each column (x in data units, y in axes units).
ax.set_xticklabels(
    short_techniques, rotation=90, ha="center", fontsize=16, color="black"
)
for idx, label in enumerate(features):
    ax.text(
        idx + 0.5,
        -1.0,
        label,
        ha="center",
        fontsize=16,
        rotation=90,
        color="black",
        transform=ax.get_xaxis_transform(),
    )

ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=16, color="black")

# Axis labels
plt.xlabel("Persuasion Technique and LIWC Feature", fontsize=18, labelpad=120)
plt.ylabel("Disinformation Domain", fontsize=18)

plt.tight_layout()
# NOTE(review): the top-correlation heatmap cell above saves to this same path,
# so this call overwrites that figure. The "figures" directory is not created
# here; presumably it already exists.
plt.savefig("figures/correlation_heatmap.pdf", bbox_inches="tight", dpi=600)
plt.show()


print(
    "This version shows highest variance in correlation coefficients across datasets."
)

In [None]:
def get_examples(df, persuasion_technique, liwc_feature, n=5):
    """
    Get example sentences from the dataset that contain the given persuasion technique and LIWC feature.

    A row qualifies when its "persuasion_techniques" value contains
    `persuasion_technique` as a literal substring and its `liwc_feature`
    score is strictly greater than zero. Rows whose "persuasion_techniques"
    value is missing are treated as not containing the technique.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the dataset. Must have the columns
      "Text", "persuasion_techniques" and `liwc_feature`.
      NOTE(review): "persuasion_techniques" is assumed to hold strings (e.g. a
      stringified list); confirm against how the LIWC frames are loaded.
    - persuasion_technique (str): The persuasion technique to search for
      (matched literally, not as a regular expression).
    - liwc_feature (str): The LIWC feature to search for.
    - n (int): Number of examples to retrieve.

    Returns:
    - examples (pd.DataFrame): At most `n` rows with the columns "Text" and
      `liwc_feature`, ordered by descending `liwc_feature` score.
    """
    # regex=False: technique names are identifiers, so characters such as
    # "." or "+" must not be interpreted as regex syntax.
    # na=False: a missing annotation would otherwise yield NaN in the mask and
    # boolean indexing would raise "Cannot mask with non-boolean array".
    has_technique = df["persuasion_techniques"].str.contains(
        persuasion_technique, regex=False, na=False
    )
    technique_rows = df[has_technique]
    feature_rows = technique_rows[technique_rows[liwc_feature] > 0]
    feature_rows = feature_rows.sort_values(by=liwc_feature, ascending=False)
    return feature_rows.head(n)[["Text", liwc_feature]]

In [None]:
# High use of "cogproc" words can indicate that the text involves reasoning, complex thinking, or explanation.
# Print the text of up to 20 Appeal_to_Authority rows with the highest
# "cogproc" score; the loop has no value, so the cell shows only the prints.
for example_text in get_examples(
    climate_fever_liwc_df, "Appeal_to_Authority", "cogproc", n=20
)["Text"]:
    print(example_text)

# NASA satellite data from the years 2000 through 2011 show the Earth's atmosphere is allowing far more heat to be released into space than alarmist computer models have predicted, reports a new study in the peer-reviewed science journal Remote Sensing
# More than half of the 44 studies selected for publication found that raised levels of CO2 had little or no impact on marine life, including crabs, limpets, sea urchins and sponges
# Some scientists believe that solar activity is more likely to influence today’s climate than carbon dioxide, and Dr Soon has compiled data showing temperature in America, Canada and Mexico rises and falls in line with solar activity.

In [None]:
# The LIWC "assent" category identifies words that express agreement or affirmation, such as "agree," "OK," and "yes." The presence of assent words in a text can indicate a speaker's or writer's concurrence or acceptance.
# The "Conversation Killer" technique covers statements that effectively end or stifle further discussion.

# Up to 20 Conversation_Killer rows from the CIDII frame, ranked by "assent".
df = get_examples(
    df=cidii_liwc_df,
    persuasion_technique="Conversation_Killer",
    liwc_feature="assent",
    n=20,
)
# Print every matching sentence, then display the table with the scores.
for sentence in df["Text"]:
    print(sentence)
df

# There is absolutely no way to verify anything.
# Absolutely nothing!!!

In [None]:
# Up to 20 Appeal_to_Fear-Prejudice rows from Climate-FEVER, ranked by "conflict".
df = get_examples(
    df=climate_fever_liwc_df,
    persuasion_technique="Appeal_to_Fear-Prejudice",
    liwc_feature="conflict",
    n=20,
)
# Print every matching sentence, then display the table with the scores.
for sentence in df["Text"]:
    print(sentence)
df

In [None]:
# Up to 10 Appeal_to_Fear-Prejudice rows from Climate-FEVER, ranked by "death".
df = get_examples(
    df=climate_fever_liwc_df,
    persuasion_technique="Appeal_to_Fear-Prejudice",
    liwc_feature="death",
    n=10,
)
# Print every matching sentence, then display the table with the scores.
for sentence in df["Text"]:
    print(sentence)
df

In [None]:
# Up to 10 Appeal_to_Values rows from Climate-FEVER, ranked by "emo_pos".
df = get_examples(
    df=climate_fever_liwc_df,
    persuasion_technique="Appeal_to_Values",
    liwc_feature="emo_pos",
    n=10,
)
# Print every matching sentence, then display the table with the scores.
for sentence in df["Text"]:
    print(sentence)
df