# Analysis of Effects of Articles on Posts

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import kruskal
from joblib import Parallel, delayed
import plotly.graph_objects as go

Load and specify data for analysis

In [2]:
# Render floats with four decimal places in every displayed table.
pd.set_option('display.float_format', '{:.4f}'.format)

# Emotion score columns included in the analysis:
# the _A suffix marks articles, the _P suffix marks posts.
emotions_a = ['Anger_A', 'Fear_A', 'Disgust_A', 'Joy_A', 'None_A']
emotions_p = ['Anger_P', 'Fear_P', 'Disgust_P', 'Joy_P', 'None_P']

# Read the sentiment results and keep only the relevant columns
# (article emotions, newsroom topic, post emotions - in that order).
sentiments = pd.read_csv("sentiment_results.csv")[
    emotions_a + ['NewsroomTopic'] + emotions_p
]

Global Variables

In [3]:
# Number of jobs handed to joblib.Parallel (chosen for CPU utilisation).
parallel_jobs = 10

# Ranking of article emotion scores: `rank_threshold` is the width of the
# outermost bin at each end of [0, 1]; `between_ranks` is the number of
# equal-width bins placed between those two outer bins.
between_ranks = 2
rank_threshold = 0.1

# Error control: significance level and number of bootstrap replicates.
alpha = 0.01
bootstrap_samples = 10000

# Base seed for reproducible resampling.
seed = 666

# Total number of observations.
n = len(sentiments)

# Post emotions and topics as plain numpy arrays, so each bootstrap
# replicate can resample them by integer index.
dv_arrays = {dv: sentiments[dv].to_numpy() for dv in emotions_p}
topics = sentiments['NewsroomTopic'].to_numpy()
topic_levels = np.unique(topics)

Test Functions

In [4]:
def kruskal_test(groups):
    """Run a Kruskal-Wallis H-test and return ``(effect_size, p_value)``.

    Parameters
    ----------
    groups : iterable of 1-D array-likes
        One sample per group. Empty groups are ignored.

    Returns
    -------
    tuple
        ``(H / (n_total - 1), p)`` where ``H`` is the Kruskal-Wallis statistic
        and ``n_total`` the pooled number of observations in the non-empty
        groups. (This H / (n - 1) form is commonly referred to as
        epsilon-squared; the rest of the notebook reports it as eta2.)
        ``(nan, nan)`` is returned when the test is undefined: fewer than two
        non-empty groups, or every pooled observation has the same value.
    """
    clean_groups = [g for g in groups if len(g) > 0]

    if len(clean_groups) <= 1:
        return np.nan, np.nan

    # H is undefined when all observations are tied (the tie correction is
    # zero). SciPy versions differ in how they report this - some raise
    # ValueError - which would abort a whole bootstrap run, so return the same
    # NaN sentinel that callers already handle.
    pooled = np.concatenate([np.ravel(g) for g in clean_groups])
    if np.all(pooled == pooled[0]):
        return np.nan, np.nan

    stat, p = kruskal(*clean_groups)
    n_total = sum(len(g) for g in clean_groups)

    return stat / (n_total - 1), p

In [5]:
def bootstrap_stats(eta_list, p_list, significance_level=None):
    """Summarise the effect sizes and p-values collected over bootstrap replicates.

    NaN entries (replicates in which the test was undefined) are excluded from
    every statistic, including the denominator of the significance share.

    Parameters
    ----------
    eta_list : sequence of float
        Effect size of each replicate.
    p_list : sequence of float
        p-value of each replicate.
    significance_level : float, optional
        Level used for the percentile interval and the significance share.
        Defaults to the notebook-wide ``alpha``.

    Returns
    -------
    tuple
        ``(eta_mean, eta_ci_lower, eta_ci_upper, p_mean, significance)`` where
        the interval is the ``level / 2`` and ``1 - level / 2`` percentile of
        the effect sizes and ``significance`` is the share of valid replicates
        with ``p <= level``. A statistic is NaN when it has no valid input.
    """
    if significance_level is None:
        significance_level = alpha

    eta = np.asarray(eta_list, dtype=float)
    p = np.asarray(p_list, dtype=float)
    valid_eta = eta[~np.isnan(eta)]
    valid_p = p[~np.isnan(p)]

    if valid_eta.size > 0:
        eta_mean = valid_eta.mean()
        eta_ci_lower, eta_ci_upper = np.percentile(
            valid_eta,
            [100 * significance_level / 2, 100 * (1 - significance_level / 2)],
        )
    else:
        eta_mean = eta_ci_lower = eta_ci_upper = np.nan

    if valid_p.size > 0:
        p_mean = valid_p.mean()
        # Divide by the valid replicates only, so that NaN replicates do not
        # silently count as "not significant".
        significance = float(np.mean(valid_p <= significance_level))
    else:
        p_mean = np.nan
        significance = np.nan

    return eta_mean, eta_ci_lower, eta_ci_upper, p_mean, significance

Processing Rank Based Emotions

In [6]:
def emotion_bootstrap(seed_value, dv, iv_emo):
    """Run one bootstrap replicate of the rank-based emotion test.

    The observations are resampled with replacement, the article emotion
    ``iv_emo`` is binned into ranks, and within every newsroom topic a
    Kruskal-Wallis test compares the post emotion ``dv`` across those ranks.

    Parameters
    ----------
    seed_value : int
        Seed of the random generator used for this replicate.
    dv : str
        Key into ``dv_arrays`` (a post emotion column).
    iv_emo : str
        Column of ``sentiments`` holding the article emotion to be ranked.

    Returns
    -------
    tuple
        ``(eta_mean, p_mean)``: effect size and p-value averaged over the
        topics in which the test was defined; ``(nan, nan)`` if it was
        defined in none.
    """
    rng = np.random.default_rng(seed_value)
    idx = rng.choice(n, size=n, replace=True)
    dv_data = dv_arrays[dv][idx]
    iv_data = sentiments[iv_emo].to_numpy()[idx]
    topic_data = topics[idx]

    # Bin edges on [0, 1]: one outer bin of width `rank_threshold` at each end
    # and `between_ranks` equal-width bins in between.
    middle_edges = np.linspace(rank_threshold, 1 - rank_threshold, between_ranks + 1)
    edges = np.concatenate([[0], middle_edges, [1]])

    # A zero threshold makes the outer edges coincide with 0 and 1, and pd.cut
    # rejects duplicate edges. Drop consecutive duplicates (zero-width bins) and
    # derive the number of ranks from the edges that remain. Non-monotonic
    # edges are left untouched, so an invalid threshold still fails in pd.cut.
    edges = edges[np.concatenate([[True], np.diff(edges) != 0])]
    num_ranks = len(edges) - 1

    rank_data = pd.cut(iv_data, bins=edges, labels=False, include_lowest=True) + 1
    eta_vals = []
    p_vals = []

    for lvl in topic_levels:
        mask = topic_data == lvl
        groups = [dv_data[mask & (rank_data == r)] for r in range(1, num_ranks + 1)]
        eta, p = kruskal_test(groups)

        # Topics where the test is undefined do not contribute to the averages.
        if not np.isnan(eta):
            eta_vals.append(eta)
            p_vals.append(p)

    eta_mean = np.mean(eta_vals) if len(eta_vals) > 0 else np.nan
    p_mean = np.mean(p_vals) if len(p_vals) > 0 else np.nan

    return eta_mean, p_mean

In [7]:
def emotion_parallel(dv, iv_emo):
    """Bootstrap the rank-based emotion test in parallel and summarise it.

    Runs ``emotion_bootstrap`` once per replicate, seeding replicate ``i``
    with ``seed + i``, and passes the collected effect sizes and p-values to
    ``bootstrap_stats``.
    """
    tasks = (
        delayed(emotion_bootstrap)(seed + offset, dv, iv_emo)
        for offset in range(bootstrap_samples)
    )
    replicates = Parallel(n_jobs=parallel_jobs)(tasks)

    eta_vals = [eta for eta, _ in replicates]
    p_vals = [p_value for _, p_value in replicates]

    return bootstrap_stats(eta_vals, p_vals)

Processing Categorical Factor NewsroomTopic

In [8]:
def topic_bootstrap(seed_value, dv):
    """Run one bootstrap replicate of the Kruskal-Wallis test across topics.

    Resamples the observations with replacement using ``seed_value``, splits
    the post emotion ``dv`` by newsroom topic and returns the
    ``(effect_size, p_value)`` pair produced by ``kruskal_test``.
    """
    rng = np.random.default_rng(seed_value)
    resample = rng.choice(n, size=n, replace=True)
    scores = dv_arrays[dv][resample]
    labels = topics[resample]

    # One sample per topic level, in the order given by `topic_levels`.
    per_topic = [scores[labels == level] for level in topic_levels]

    return kruskal_test(per_topic)

In [9]:
def topic_parallel(dv):
    """Bootstrap the topic test in parallel and summarise it.

    Runs ``topic_bootstrap`` once per replicate, seeding replicate ``i`` with
    ``seed + i``, and passes the collected effect sizes and p-values to
    ``bootstrap_stats``.
    """
    tasks = (
        delayed(topic_bootstrap)(seed + offset, dv)
        for offset in range(bootstrap_samples)
    )
    replicates = Parallel(n_jobs=parallel_jobs)(tasks)

    eta_vals = [eta for eta, _ in replicates]
    p_vals = [p_value for _, p_value in replicates]

    return bootstrap_stats(eta_vals, p_vals)

Summarize Tests with Structured Output

In [10]:
def kruskal_wallis_bootstrap():
    """Run every bootstrap test and collect the summaries in one DataFrame.

    For each post emotion (criterion) the table holds one row per article
    emotion (type "emotion") followed by one row for the newsroom topic.
    """
    stat_names = (
        "eta2_mean",
        "eta2_ci_lower",
        "eta2_ci_upper",
        "mean_p_value",
        "significance",
    )
    records = []

    for dv in emotions_p:
        # Article emotions as rank-based predictors.
        for iv_emo in emotions_a:
            summary = emotion_parallel(dv, iv_emo)
            records.append({
                "criterion": dv,
                "predictor": iv_emo,
                "type": "emotion",
                **dict(zip(stat_names, summary)),
            })

        # Newsroom topic as a categorical predictor.
        summary = topic_parallel(dv)
        records.append({
            "criterion": dv,
            "predictor": "NewsroomTopic",
            "type": "NewsroomTopic",
            **dict(zip(stat_names, summary)),
        })

    return pd.DataFrame(records)

# Run Analysis

In [11]:
# Runs the full bootstrap for every criterion/predictor pair; each pair is
# resampled `bootstrap_samples` times, so this is likely the expensive step.
results = kruskal_wallis_bootstrap()

Results for anger in Posts

In [12]:
# Rows of the summary table whose criterion is anger in posts.
anger = results.loc[results['criterion'].eq('Anger_P')]
anger

Unnamed: 0,criterion,predictor,type,eta2_mean,eta2_ci_lower,eta2_ci_upper,mean_p_value,significance
0,Anger_P,Anger_A,emotion,0.0062,0.005,0.0075,0.0002,0.9964
1,Anger_P,Fear_A,emotion,0.0033,0.0025,0.0043,0.0439,0.1827
2,Anger_P,Disgust_A,emotion,0.0002,0.0001,0.0005,0.3063,0.0
3,Anger_P,Joy_A,emotion,0.0011,0.0007,0.0016,0.1329,0.0001
4,Anger_P,None_A,emotion,0.0031,0.0023,0.0042,0.0455,0.1708
5,Anger_P,NewsroomTopic,NewsroomTopic,0.0118,0.0107,0.0129,0.0,1.0


Results for fear in Posts

In [13]:
# Rows of the summary table whose criterion is fear in posts.
fear = results.loc[results['criterion'].eq('Fear_P')]
fear

Unnamed: 0,criterion,predictor,type,eta2_mean,eta2_ci_lower,eta2_ci_upper,mean_p_value,significance
6,Fear_P,Anger_A,emotion,0.0049,0.0041,0.0059,0.0282,0.3714
7,Fear_P,Fear_A,emotion,0.0131,0.0116,0.0147,0.0001,0.9987
8,Fear_P,Disgust_A,emotion,0.0007,0.0004,0.001,0.2436,0.0
9,Fear_P,Joy_A,emotion,0.0027,0.0021,0.0034,0.0318,0.3006
10,Fear_P,None_A,emotion,0.0104,0.0091,0.012,0.0366,0.2653
11,Fear_P,NewsroomTopic,NewsroomTopic,0.0302,0.0284,0.032,0.0,1.0


Results for disgust in Posts

In [14]:
# Rows of the summary table whose criterion is disgust in posts.
disgust = results.loc[results['criterion'].eq('Disgust_P')]
disgust

Unnamed: 0,criterion,predictor,type,eta2_mean,eta2_ci_lower,eta2_ci_upper,mean_p_value,significance
12,Disgust_P,Anger_A,emotion,0.0044,0.0035,0.0055,0.0008,0.984
13,Disgust_P,Fear_A,emotion,0.0029,0.0023,0.0037,0.0489,0.1339
14,Disgust_P,Disgust_A,emotion,0.0021,0.0016,0.0026,0.0427,0.2049
15,Disgust_P,Joy_A,emotion,0.0008,0.0005,0.0013,0.1044,0.005
16,Disgust_P,None_A,emotion,0.002,0.0014,0.0027,0.018,0.5369
17,Disgust_P,NewsroomTopic,NewsroomTopic,0.0082,0.0073,0.0091,0.0,1.0


Results for joy in Posts

In [15]:
# Rows of the summary table whose criterion is joy in posts.
joy = results.loc[results['criterion'].eq('Joy_P')]
joy

Unnamed: 0,criterion,predictor,type,eta2_mean,eta2_ci_lower,eta2_ci_upper,mean_p_value,significance
18,Joy_P,Anger_A,emotion,0.0033,0.0026,0.0041,0.0359,0.2619
19,Joy_P,Fear_A,emotion,0.0114,0.01,0.0129,0.0019,0.9507
20,Joy_P,Disgust_A,emotion,0.0003,0.0001,0.0005,0.3303,0.0
21,Joy_P,Joy_A,emotion,0.0038,0.0029,0.0047,0.0208,0.4984
22,Joy_P,None_A,emotion,0.0123,0.0109,0.0138,0.0288,0.3549
23,Joy_P,NewsroomTopic,NewsroomTopic,0.0429,0.0407,0.045,0.0,1.0


Results for none in Posts

In [16]:
# Rows of the summary table whose criterion is "none" (no emotion) in posts.
none = results.loc[results['criterion'].eq('None_P')]
none

Unnamed: 0,criterion,predictor,type,eta2_mean,eta2_ci_lower,eta2_ci_upper,mean_p_value,significance
24,None_P,Anger_A,emotion,0.0015,0.001,0.0021,0.07,0.038
25,None_P,Fear_A,emotion,0.0061,0.0051,0.0072,0.0053,0.8549
26,None_P,Disgust_A,emotion,0.0002,0.0001,0.0004,0.4082,0.0
27,None_P,Joy_A,emotion,0.0013,0.0008,0.0019,0.0666,0.0567
28,None_P,None_A,emotion,0.0074,0.0063,0.0086,0.0008,0.9833
29,None_P,NewsroomTopic,NewsroomTopic,0.0106,0.0096,0.0117,0.0,1.0


Plots

In [26]:
def plot_results_dependent():
    """Plot the effect sizes as grouped bars, one group per criterion.

    Reads the global ``results`` table. Criteria (x axis) and predictors (one
    bar trace each) are both ordered by their mean effect size, largest first.
    Error bars span the bootstrap interval stored in ``eta2_ci_lower`` and
    ``eta2_ci_upper``. The figure is shown; nothing is returned.
    """
    predictor_order = results.groupby('predictor')['eta2_mean'].mean().sort_values(ascending=False).index
    criteria = results.groupby('criterion')['eta2_mean'].mean().sort_values(ascending=False).index
    n_crit = len(criteria)
    n_pred = len(predictor_order)
    total_width = 0.8
    bar_width = total_width / n_pred
    fig = go.Figure()

    for i, pred in enumerate(predictor_order):
        x_positions = []
        y_values = []
        error_y_lower = []
        error_y_upper = []
        hover_texts = []

        for j, crit in enumerate(criteria):
            row = results[(results['predictor'] == pred) & (results['criterion'] == crit)]

            # Slot of predictor i inside the group centred on tick j. A missing
            # combination uses the same slot, so its placeholder does not land
            # on the group centre and overlap the other bars.
            x = j - total_width / 2 + i * bar_width + bar_width / 2
            x_positions.append(x)

            if row.empty:
                y_values.append(0)
                error_y_lower.append(0)
                error_y_upper.append(0)
                hover_texts.append(f"{pred}<br>{crit}<br>No data")

                continue

            eta = row['eta2_mean'].values[0]
            ci_lower = row['eta2_ci_lower'].values[0]
            ci_upper = row['eta2_ci_upper'].values[0]
            p_val = row['mean_p_value'].values[0]
            y_values.append(eta)
            # Plotly expects error bars as distances from the bar height.
            error_y_lower.append(eta - ci_lower)
            error_y_upper.append(ci_upper - eta)
            # Four decimals: three would round the small effect sizes to 0.000.
            hover_texts.append(
                f"<b>Predictor:</b> {pred}<br>"
                f"<b>Criterion:</b> {crit}<br>"
                f"<b>Eta²:</b> {eta:.4f}<br>"
                f"<b>CI:</b> [{ci_lower:.4f}, {ci_upper:.4f}]<br>"
                f"<b>Mean p-value:</b> {p_val:.4f}"
            )

        fig.add_trace(go.Bar(
            x=x_positions,
            y=y_values,
            name=pred,
            width=bar_width * 0.95,
            error_y=dict(
                type='data',
                symmetric=False,
                array=error_y_upper,
                arrayminus=error_y_lower,
                color='black',
                thickness=1.5,
                width=5
            ),
            hovertext=hover_texts,
            hoverinfo="text"
        ))

    tick_positions = np.arange(n_crit)
    fig.update_layout(
        xaxis=dict(
            tickmode='array',
            tickvals=tick_positions,
            ticktext=list(criteria),
            title="Dependent Variable"
        ),
        yaxis=dict(title="Kruskal-Wallis eta²"),
        barmode='group',
        title="Effects by Criterion with CI",
        legend_title_text="Predictor",
        width=1000,
        height=600
    )
    fig.show()

plot_results_dependent()

In [None]:
def plot_results_independent():
    """Plot the effect sizes as grouped bars, one group per predictor.

    Reads the global ``results`` table. Predictors (x axis) and criteria (one
    coloured bar trace each) are both ordered by their mean effect size,
    largest first. Error bars span the bootstrap interval stored in
    ``eta2_ci_lower`` and ``eta2_ci_upper``. A missing predictor/criterion
    combination is drawn as a zero-height bar. The figure is shown; nothing is
    returned.
    """
    predictor_order = results.groupby('predictor')['eta2_mean'].mean().sort_values(ascending=False).index
    n_pred = len(predictor_order)

    criteria = results.groupby('criterion')['eta2_mean'].mean().sort_values(ascending=False).index
    n_crit = len(criteria)

    # Fixed palette, cycled if there are more criteria than colours.
    colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692']
    criterion_colors = {
        crit: colors[i % len(colors)] for i, crit in enumerate(criteria)
    }

    total_width = 0.8
    bar_width = total_width / n_crit

    fig = go.Figure()

    for j, crit in enumerate(criteria):
        x_positions = []
        y_values = []
        error_y_lower = []
        error_y_upper = []
        hover_texts = []

        for i, pred in enumerate(predictor_order):
            row = results[(results['predictor'] == pred) & (results['criterion'] == crit)]

            if row.empty:
                eta = 0
                ci_lower = 0
                ci_upper = 0
                p_val = np.nan

            else:
                eta = row['eta2_mean'].values[0]
                ci_lower = row['eta2_ci_lower'].values[0]
                ci_upper = row['eta2_ci_upper'].values[0]
                p_val = row['mean_p_value'].values[0]

            # Slot of criterion j inside the group centred on tick i.
            x = i - total_width / 2 + j * bar_width + bar_width / 2
            x_positions.append(x)
            y_values.append(eta)
            # Plotly expects error bars as distances from the bar height.
            error_y_lower.append(eta - ci_lower)
            error_y_upper.append(ci_upper - eta)
            # Four decimals: three would round the small effect sizes to 0.000.
            hover_texts.append(
                f"<b>Predictor:</b> {pred}<br>"
                f"<b>Criterion:</b> {crit}<br>"
                f"<b>Eta²:</b> {eta:.4f}<br>"
                f"<b>CI:</b> [{ci_lower:.4f}, {ci_upper:.4f}]<br>"
                f"<b>Mean p-value:</b> {p_val:.4f}"
            )

        fig.add_trace(go.Bar(
            x=x_positions,
            y=y_values,
            name=crit,
            marker_color=criterion_colors[crit],
            width=bar_width*0.9,
            error_y=dict(
                type='data',
                symmetric=False,
                array=error_y_upper,
                arrayminus=error_y_lower,
                color='black',
                thickness=1.5,
                width=5
            ),
            hovertext=hover_texts,
            hoverinfo='text'
        ))

    # One tick per predictor, at the centre of its group of bars.
    tick_positions = list(range(n_pred))
    tick_texts = list(predictor_order)

    fig.update_layout(
        xaxis=dict(
            tickmode='array',
            tickvals=tick_positions,
            ticktext=tick_texts,
            title="Predictor"
        ),
        yaxis=dict(title="Kruskal-Wallis eta²"),
        barmode='group',
        title="Effects by Predictor with CI",
        legend_title_text="Criterion",
        width=1000,
        height=600
    )
    fig.show()

plot_results_independent()

Save Results as CSV

In [None]:
# Export is disabled; uncomment the next line to write the summary table to results.csv.
#results.to_csv('results.csv', index=False)