# placeholder {.hidden}

In [None]:
#| eval: false
import os
from pathlib import Path
import gc

import pandas as pd
import numpy as np

import re
from utils.fighting_words import bayes_compare_language as compute_fighting_words
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

from statsmodels.stats.proportion import proportion_confint
from scipy.stats import chi2_contingency, fisher_exact
from itertools import combinations, permutations

base_path = Path.cwd().parent.parent

import sys
sys.path.append(str(base_path / 'paper'))

from reporting import *

from IPython.display import HTML
def latex_table(x, *args, **kwargs):
    """Render `x` as an HTML table; positional extras are accepted but ignored."""
    return HTML(x.to_html(**kwargs))

# Data directories, all resolved relative to base_path
# (base_path is defined above as two levels up from the current working directory)
data_path = base_path / 'data'
labeled_path = data_path / 'labeled'
intermediate_path = data_path / 'intermediate'
manifestos_path = data_path / 'manifestos'
annotations_path = data_path / 'annotations' # / 'group_mention_categorization'

In [None]:
#| cache: true
# Load the labeled group mentions from the labeled-data directory.
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
fp = labeled_path / 'labeled_mentions_with__party_metadata.pkl'
df = pd.read_pickle(fp)

# NOTE: subsetting to **non-universal attributes**
# i.e. keep only rows where at least one column in label_cols is truthy
df_any = df[df[label_cols].any(axis=1)]

## Differences between Green and Populist Radical-Right party families

Above, we have validated our hierarchical attribute-centered group mention labeling scheme and documented with it that attribute categories vary in their overall prevalence in party manifestos as well as in their tendency to be used intersectionally through combinations with other attributes.
<!-- . -->
We turn next to the question of how much parties' strategic calculus and group focus strategies contribute to the observed patterns of attribute prevalence and co-occurrence in party manifestos.
On the one hand, it is very likely that the cultural, socio-political, historic, and thematic context acts as a constraint on parties' possibilities to combine different attributes creatively to refer to social groups in their election programs.
On the other hand, the literature on political parties' group-based strategies suggests that parties strategically select which social groups to appeal to in their election programs [@cites].
We examine this question next by comparing attribute prevalence and co-occurrence patterns between the Green and Populist Radical-Right (PRR) party families.  

In [None]:
# Lookup table from raw party-family codes to the display labels used in figures
family_map = {
    'prrp': 'Populist Radical-Right',
    'green': 'Green',
    'con': 'Conservative',
    'sd': 'Social Democratic',
}


def normalize_family(x):
    """Translate a raw party-family code into its display label.

    The code is stringified, stripped of surrounding whitespace and lower-cased
    before the lookup in `family_map`. Missing values (as detected by
    `pd.isna`) and codes without an entry yield None.
    """
    key = None if pd.isna(x) else str(x).strip().lower()
    return family_map.get(key)

# Fail fast: the by-family analyses below all read df['party_family']
if 'party_family' not in df.columns:
    raise ValueError("party_family column missing in df. Cannot compute prevalence by family.")

In [None]:
# Party families compared in the main text; the list order doubles as hue order
fam_hue_order = ['Populist Radical-Right', 'Green']

# Pick both family colours from the diverging PRGn colormap (purple end for PRR,
# green end for Green) so bar charts and heatmaps share one colour coding
norm = mcolors.Normalize(vmin=-1, vmax=1)
fam_col_palette = {
    fam: mcolors.to_hex(plt.cm.PRGn(norm(position)))
    for fam, position in zip(fam_hue_order, (-0.7, 0.7))
}

In [None]:
# Extended order and palette that add the two mainstream families
all_fam_order = [*fam_hue_order, 'Conservative', 'Social Democratic']
all_fam_palette = {
    **fam_col_palette,
    'Conservative': '#377eb8',       # Blue for conservatives
    'Social Democratic': '#e41a1c',  # Red for social democrats
}

In [None]:
# Dimension-level prevalence for PRR and Green parties, based on ALL mentions
# (including "universal" ones that carry no attribute label)
fam_df = df.assign(party_family_label=df['party_family'].apply(normalize_family))
# TODO: reconsider this choice
fam_df = fam_df.loc[fam_df['party_family_label'].isin(['Populist Radical-Right', 'Green'])].copy()

# 0/1 indicators: any economic attribute, any non-economic attribute, no attribute at all
fam_df = fam_df.assign(**{
    'economic': fam_df[econ_attrs].any(axis=1).astype(int),
    'non-economic': fam_df[nonecon_attrs].any(axis=1).astype(int),
    'universal': (~fam_df[label_cols].any(axis=1)).astype(int),
})

dim_fam_prev = compute_prevalence(
    fam_df,
    ['economic', 'non-economic', 'universal'],
    group_by='party_family_label',
)
# Fix the category order of the dimension labels
dim_fam_prev['attribute'] = pd.Categorical(
    dim_fam_prev['attribute'],
    categories=['non-economic', 'economic', 'universal'],
    ordered=True,
)

In [None]:
# Rebuild fam_df from the attribute-bearing mentions only (df_any),
# again restricted to PRR and Green parties
fam_df = df_any.assign(party_family_label=df_any['party_family'].apply(normalize_family))
# TODO: reconsider this choice
fam_df = fam_df.loc[fam_df['party_family_label'].isin(['Populist Radical-Right', 'Green'])].copy()

In [None]:
# Attribute-level prevalence by party family, one table per attribute dimension
econ_fam_prev, nonecon_fam_prev = (
    compute_prevalence(fam_df, attrs, group_by='party_family_label')
    for attrs in (econ_attrs, nonecon_attrs)
)

# Swap internal attribute keys for their readable names
econ_fam_prev = econ_fam_prev.assign(
    attribute=econ_fam_prev['attribute'].map(attribute_category_names_map)
)
nonecon_fam_prev = nonecon_fam_prev.assign(
    attribute=nonecon_fam_prev['attribute'].map(attribute_category_names_map)
)

# Sanity check: every point estimate must lie inside its confidence interval
assert all(
    prev.query("prevalence < ci_low | prevalence > ci_high").empty
    for prev in (econ_fam_prev, nonecon_fam_prev)
)

In [None]:
#| label: fig-prevalence_by_family
#| output: true
#| fig-cap: "Prevalence of attribute dimensions, economic and non-economic attributes in social group mentions in Populist Radical-Right and Green parties' election manifestos. Bars show share of mentions containing each attribute, with 95% confidence intervals. *Note:* Top panel shows prevalence in all mentions, while middle and bottom panels show prevalence in mentions containing at least one attribute."

# Figure layout: three stacked panels (attribute dimensions, economic attributes,
# non-economic attributes). Panel heights are proportional to the number of bars:
# 3 dimensions, n_econ economic and n_nonecon non-economic attribute categories.
n_econ = econ_fam_prev['attribute'].nunique()
n_nonecon = nonecon_fam_prev['attribute'].nunique()
r_ = 0.3  # figure height contributed by each attribute row
fig_height = r_ * 3 + r_ * n_econ + r_ * n_nonecon + 2  # constant 2 is extra height; presumably for titles/legend
fig, axes = plt.subplots(
    nrows=3,
    ncols=1,
    figsize=(6, fig_height),
    gridspec_kw={'height_ratios': [3, n_econ, n_nonecon], 'hspace': 0.2},
    sharex=True
)

# Top panel: attribute dimensions (dim_fam_prev is computed on all mentions)
plot_prevalence_bars(
    dim_fam_prev,
    axes[0],
    'Attribute dimensions',
    attribute_order=['universal', 'economic', 'non-economic'],
    hue_col='party_family_label',
    hue_order=fam_hue_order,
    palette=fam_col_palette,
    xlim=(0, .55)
)

# Middle panel: economic attribute categories (mentions with at least one attribute)
plot_prevalence_bars(
    econ_fam_prev,
    axes[1],
    'Economic attributes',
    hue_col='party_family_label',
    hue_order=fam_hue_order,
    palette=fam_col_palette,
    xlim=(0, .55)
)

# Bottom panel: non-economic attribute categories (mentions with at least one attribute)
plot_prevalence_bars(
    nonecon_fam_prev,
    axes[2],
    'Non-economic attributes',
    hue_col='party_family_label',
    hue_order=fam_hue_order,
    palette=fam_col_palette,
    xlim=(0, .55)
)

# Add shared legend below the lower plot; handles are built by hand from the palette
handles = [plt.Rectangle((0,0), 1,1, fc=fam_col_palette[fam]) for fam in fam_hue_order]
labels = fam_hue_order
fig.legend(handles, labels, loc='lower center', bbox_to_anchor=(0.5, -0.01), ncol=2, frameon=False)

# plt.tight_layout()
plt.show()

@fig-prevalence_by_family reports the prevalence of attribute dimensions and categories in PRR and Green parties' manifestos, revealing several noteworthy patterns.
First, we find no or only minor differences in these two party families' overall tendency to characterize groups they mention with economic or non-economic attributes.
In particular, while Green parties tend to use "universal" group references slightly more frequently than PRR parties, the magnitude of this difference does arguably not indicate a substantially higher degree of "universalism" in Green parties' group focus strategies.

However, notable differences between PRR and Green parties arise when focusing on the prevalence of specific attribute categories in the set of group mentions with at least one attribute.
In the case of economic attributes, PRR parties focus more heavily on _occupation/profession_ and less on _income/wealth/economic status_ than Green parties.
This is consistent with the notion that PRR parties' group focus strategies are more oriented towards appealing to specific occupational groups (e.g., "working class", "small business owners") and less towards emphasizing economic inequality and redistribution as a core issue.
Notably, despite its well-established role as a driver of voting behavior and its rising issue salience in recent decades [@cite], _education_ is relatively infrequently expressed in social group mentions by both party families.

Regarding non-economic attributes, PRR parties emphasize _nationality_ and _crime_ more prominently than Green parties, while Green parties place greater emphasis on _age_ and _gender/sexuality_ as salient attributes for identifying and mobilizing social groups.
However, _religion_ and _ethnicity_ are very uncommon in both party families' group mentions, and any prevalence differences between them are thus in the permille range.

@fig-prevalence_by_family_w_mainstream in the Supporting Materials extends this comparison by adding prevalence estimates for center-right/conservative and center-left/social democratic mainstream parties based on data from three countries (Germany, Sweden, and the UK).
This comparison reveals how Green and PRR parties differentiate through their focus on certain group attribute categories from their mainstream competitors.
It shows that Green parties in these countries tend to differentiate through their non-economic attribute emphasis profile, which is characterized by a pronounced focus on _gender/sexuality_ and little emphasis on _crime_.
In contrast, their economic profile is very similar to that of Social Democratic parties, with similarly strong focus on _income/wealth/economic status_ albeit somewhat less emphasis on _employment status_.
PRR parties, in turn, distinguish themselves primarily through their emphasis on _occupation/profession_ and _nationality_ and thus adopt an intersectional differentiation strategy.
Interestingly, their focus on _crime_ is not markedly different from that of conservative/center-right parties, suggesting this is not a unique dimension of PRR parties' group attribute focus.

Further, the differences between PRR and Green parties' group attribute focus shown in @fig-prevalence_by_family exhibit interesting temporal patterns.
For example, @fig-prevalence_trends_economic shows that while their relative emphasis differences are overall very stable across decades, PRR parties' overall stronger focus on the _occupation/profession_ attribute is more pronounced in the 1970s than in later decades.
Regarding non-economic attribute prevalence, @fig-prevalence_trends_noneconomic shows that Green parties' _age_ focus stands out especially in the 2000s and 2010s, whereas their _gender/sexuality_ focus was relatively more pronounced before the 2000s.
PRR parties' group focus, in turn, shows a sharp increase in the prevalence of _family_ from 2010s to 2020s.

We next turn to attribute category co-occurrence patterns in Green and PRR parties' group mentions.
In @sec-prevalence_intersectionality, we have shown that the co-occurrence patterns of attributes in group mentions differ across attribute categories and dimensions and reveal interesting patterns of intersectionality in parties' group focus strategies.
Comparing these patterns between PRR and Green parties, in turn, allows us to examine these parties' group focus strategies through the lens of intersectionality.

To compare the co-occurrence patterns of attributes in group mentions between PRR and Green parties, we compute the differences in conditional probabilities of attribute co-occurrence between these two party families.
Conditional probabilities measure how likely a group mention contains attribute B when it contains attribute A.
This quantity is ideal for quantitative-descriptive comparisons between party families as it provides a comparable and interpretable measure that answers the question: How likely is it that a party uses attribute B to describe a group when it has chosen to describe it with attribute A?[^fn:cooc_metrics]

[^fn:cooc_metrics]: We discuss the rationale for using conditional probabilities as a co-occurrence metric in more detail in the Supplementary Materials, @sec-cooc_metrics.

In [None]:
# Pairwise association measures over all attributes (both dimensions together)
assoc_all = compute_attribute_associations(df_any, econ_attrs + nonecon_attrs)

# Pull out the individual measures:
#   p_b_given_a -> conditional probability, pmi -> PMI,
#   ppmi -> positive PMI (co-occurrence above chance), npmi -> normalized PMI (range [-1, 1])
cpr_all, pmi_all, ppmi_all, npmi_all = (
    assoc_all[measure] for measure in ("p_b_given_a", "pmi", "ppmi", "npmi")
)

In [None]:
# #| label: fig-attribute_cpr_overall
# #| output: true
# #| fig-cap: 'Conditional probabilities of attribute co-occurrence in social group mentions. Heatmap cells show P(B|A), the probability of mentioning attribute B given that attribute A is mentioned. Rows indicate "attribute A" (the conditioning attribute) and columns indicate "attribute B" (the outcome attribute). *Note:* Values below 0.01 are not displayed.'

# cpr_all_renamed = cpr_all.rename(index=attribute_category_names_map, columns=attribute_category_names_map)

# fig, axes = plot_heatmap(
#     x=cpr_all_renamed,  # Your data with readable labels
#     panel_groups=(econ_attr_names, nonecon_attr_names),
#     cluster_rows=False,
#     cluster_cols=False,
#     cmin=0.01,
#     cmap='YlOrRd',
#     clims=(0, 1),
#     clegend_title='Conditional Probability P(B|A)'
# )
# axes[0].set_ylabel("economic\n", fontweight='bold')
# axes[1].set_ylabel("non-economic\n", fontweight='bold')
# plt.show()

In [None]:
#| label: fig-attribute_cpr_differences
#| output: true
#| fig-cap: 'Differences in conditional probabilities of attribute co-occurrence between Green and Populist Radical-Right (PRR) party manifestos. Heatmap cells show the difference in P(B|A) between Green and PRR parties (Green – PRR). Rows indicate "attribute A" (the conditioning attribute) and columns indicate "attribute B" (the outcome attribute). Positive values (green) indicate that attribute B is more likely to be mentioned given attribute A in Green party manifestos compared to PRR manifestos, while negative values (purple) indicate the opposite. *Note:* Values below 0.01 in absolute value are not displayed.'

# Split the attribute-bearing mentions by raw party-family code
df_prrp = df_any[df_any['party_family'] == 'prrp']
df_green = df_any[df_any['party_family'] == 'green']

# Conditional co-occurrence probabilities P(B|A) within each family
cpr_prrp, cpr_green = (
    compute_attribute_associations(family_df, label_cols)["p_b_given_a"]
    for family_df in (df_prrp, df_green)
)

# Green minus PRR: positive cells mean the pairing is more likely in Green manifestos
cpr_diffs = (cpr_green - cpr_prrp).rename(
    index=attribute_category_names_map,
    columns=attribute_category_names_map,
)

r_ = 0.3  # symmetric colour limits of the heatmap
# Draw the difference heatmap with the shared plot_heatmap helper
fig, axes = plot_heatmap(
    x=cpr_diffs,
    panel_groups=(econ_attr_names, nonecon_attr_names),
    mask_diagonal=True,
    clims=(-r_, r_),
    cmin=0.01,
    clegend_title="conditional probability difference\n(PRR vs. Green parties)",
)
axes[0].set_ylabel("economic\n", fontweight='bold')
axes[1].set_ylabel("non-economic\n", fontweight='bold')
plt.show()

@fig-attribute_cpr_differences reports the results of this descriptive analysis. <!-- of the differences in conditional probabilities of attribute co-occurrence between Green and PRR party manifestos.-->
Positive values (green) indicate that attribute B (shown in columns) is more likely to be mentioned given attribute A (shown in rows) in Green party manifestos compared to PRR manifestos, while negative values (purple) indicate the opposite.[^fn:attribute_association_difference_significance]

[^fn:attribute_association_difference_significance]: 
    Please refer to @fig-attribute_association_difference_significance for the results of statistical tests that assess whether observed differences in parties' co-mentioning patterns are statistically and substantively significant.

Several interesting patterns stand out in @fig-attribute_cpr_differences.
The purple-shaded vertical band in column _nationality_ clearly shows that PRR parties are overall more likely to combine _nationality_ with a range of other attributes than Green parties.
This difference applies to the non-economic attribute categories _place/location_, _shared_values/mentalities_, _ethnicity_, _crime_, and _religion_ as well as economic attribute categories.
Further, PRR parties' focus on _occupation/profession_ is also slightly higher than that of Green parties when they refer to non-economic group attributes like _nationality_ or _place/location_.

Green parties, in turn, are comparatively more likely to mention _age_ when they refer to _family_.
They are more inclined to mention the attributes _ethnicity_, _gender/sexuality_, and _health_ when they refer to a _religion_-based group, and _family_, _gender/sexuality_, and _health_ when they mention _ethnicity_.
Last but not least, they are more likely to mention _gender/sexuality_ when they refer to _crime_-related groups, such as victims or perpetrators.


Viewed together, the comparative analysis of attribute prevalence and co-occurrence patterns in Green and PRR parties' manifestos reveals that these two party families differ in their group focus strategies in several interesting ways.
More generally, it demonstrates the added analytical value of our attribute-centered taxonomy for the analysis of parties' group focus strategies, as it allows us to capture and compare the intersectional nature of parties' group appeals in a systematic and detailed way.

# APPENDIX

## Differences between Green and Populist Radical-Right party families

In [None]:
# #| label: fig-n_attrs_stats_by_family_by_country
# #| output: false
# #| fig-cap: "tbd"

# # compute average number of attributes by decade
# n_attrs_stats_by_family = df[['country_iso3c', 'party_family']].copy()
# n_attrs_stats_by_family['n_attributes'] = df[label_cols].sum(axis=1)
# n_attrs_stats_by_family = n_attrs_stats_by_family.query("party_family in ['prrp', 'green']")

# n_attrs_stats_by_family = n_attrs_stats_by_family.value_counts(['country_iso3c', 'party_family', 'n_attributes']).sort_index().reset_index()
# n_attrs_stats_by_family['n_attributes'] = n_attrs_stats_by_family['n_attributes'].astype(str)
# n_attrs_stats_by_family.loc[~n_attrs_stats_by_family.n_attributes.isin(["0", "1"]), "n_attributes"] = "≥2"
# n_attrs_stats_by_family['n_attributes'] = pd.Categorical(n_attrs_stats_by_family.n_attributes, categories=["0", "1", "≥2"], ordered=True)
# n_attrs_stats_by_family = n_attrs_stats_by_family.groupby(['country_iso3c', 'party_family', 'n_attributes'], observed=True).agg({'count': 'sum'}).reset_index()
# # TODO: add confidence intervals
# n_attrs_stats_by_family = n_attrs_stats_by_family.groupby(['country_iso3c', 'party_family']).apply(lambda x: x.assign(share=x['count']/x['count'].sum())).reset_index(drop=True)

# # Create country-specific subplots with horizontal stacked bars
# countries = sorted(n_attrs_stats_by_family['country_iso3c'].unique())
# families = ['prrp', 'green']
# family_labels = {'prrp': 'Populist Radical-Right', 'green': 'Green'}
# colors = {'0': '#d73027', '1': '#fee08b', '≥2': '#1a9850'}
# attr_categories = ['0', '1', '≥2']

# # Calculate grid dimensions
# n_countries = len(countries)
# n_cols = 3
# n_rows = int(np.ceil(n_countries / n_cols))

# # Fixed panel dimensions
# panel_width = 3.5
# panel_height = 1.2
# fig, axes = plt.subplots(n_rows, n_cols, figsize=(panel_width * n_cols, panel_height * n_rows),
#                          sharey=True, sharex=True)
# axes = axes.flatten() if n_countries > 1 else [axes]

# # Plot each country in its own subplot
# for idx, country in enumerate(countries):
#     ax = axes[idx]
    
#     # Get data for this country
#     country_data = n_attrs_stats_by_family[n_attrs_stats_by_family['country_iso3c'] == country]
    
#     # Set up y positions for the two party families
#     y_pos = np.arange(len(families))
#     bar_height = 0.6
    
#     # Plot each party family as a separate bar
#     for fam_idx, fam in enumerate(families):
#         fam_data = country_data[country_data['party_family'] == fam]
        
#         # Stack attribute categories horizontally
#         left_offset = 0
#         for attr_cat in attr_categories:
#             subset = fam_data[fam_data['n_attributes'] == attr_cat]
#             share = subset['share'].values[0] if len(subset) > 0 else 0
            
#             ax.barh(y_pos[fam_idx], share, bar_height, left=left_offset,
#                    color=colors[attr_cat], edgecolor='white', linewidth=0.5,
#                    label=attr_cat if idx == 0 and fam_idx == 0 else None)
            
#             # Add percentage annotation if share is > 5%
#             if share > 0.05:
#                 ax.text(left_offset + share/2, y_pos[fam_idx], f'{share:.0%}',
#                        ha='center', va='center', fontsize=7, fontweight='bold',
#                        color='white' if attr_cat in ['0', '≥2'] else 'black')
            
#             left_offset += share
    
#     # Customize subplot
#     ax.set_yticks(y_pos)
#     ax.set_yticklabels([family_labels[fam] for fam in families], fontsize=8)
#     ax.set_xlim(0, 1.0)
#     ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.0%}'))
#     ax.set_title(country, fontsize=9, fontweight='bold', pad=3)
#     ax.grid(axis='x', alpha=0.2, linestyle='--')
    
#     # Add x-label only for bottom row
#     if idx >= (n_rows - 1) * n_cols:
#         ax.set_xlabel('Share', fontsize=8)

# # Remove extra subplots
# for idx in range(n_countries, len(axes)):
#     fig.delaxes(axes[idx])

# # Add overall legend
# if n_countries > 0:
#     from matplotlib.patches import Patch
#     legend_elements = [Patch(facecolor=colors[cat], label=cat) for cat in attr_categories]
#     fig.legend(handles=legend_elements, title='Attributes', loc='upper center', 
#               ncol=3, frameon=True, fontsize=8, bbox_to_anchor=(0.5, 1.0))

# plt.tight_layout(rect=[0, 0, 1, 0.98])
# plt.show()

In [None]:
#| label: fig-prevalence_by_family_w_mainstream
#| output: true
#| fig-cap: "Prevalence of economic and non-economic group attributes in social group mentions in mainstream party manifestos (Conservative and Social Democratic) compared to PRR and Green parties. Bars show share of mentions containing each attribute, with 95% confidence intervals."

# Bar order for the four-family comparison: PRR, the two mainstream families, Green
all_fam_order_alt = ['Populist Radical-Right', 'Conservative', 'Social Democratic', 'Green']

# Countries with mainstream party data (three are listed at present)
countries_4 = ['SWE', 'DEU', 'GBR'] # TODO: consider adding USA

# Dimension-level prevalence, based on ALL mentions from these countries
df_4countries = df.loc[df['country_iso3c'].isin(countries_4)].copy()
df_4countries = df_4countries.assign(
    party_family_label=df_4countries['party_family'].apply(normalize_family)
)
df_4countries = df_4countries.assign(**{
    'economic': df_4countries[econ_attrs].any(axis=1).astype(int),
    'non-economic': df_4countries[nonecon_attrs].any(axis=1).astype(int),
    'universal': (~df_4countries[label_cols].any(axis=1)).astype(int),
})

dim_all_fam = compute_prevalence(
    df_4countries,
    ['economic', 'non-economic', 'universal'],
    group_by='party_family_label',
)

# Attribute-level prevalence, based on attribute-bearing mentions only (df_any)
df_4countries = df_any.loc[df_any['country_iso3c'].isin(countries_4)].copy()
df_4countries = df_4countries.assign(
    party_family_label=df_4countries['party_family'].apply(normalize_family)
)

econ_all_fam, nonecon_all_fam = (
    compute_prevalence(df_4countries, attrs, group_by='party_family_label')
    for attrs in (econ_attrs, nonecon_attrs)
)

# Swap internal attribute keys for their readable names
econ_all_fam = econ_all_fam.assign(
    attribute=econ_all_fam['attribute'].map(attribute_category_names_map)
)
nonecon_all_fam = nonecon_all_fam.assign(
    attribute=nonecon_all_fam['attribute'].map(attribute_category_names_map)
)

# Sanity check: every point estimate must lie inside its confidence interval
assert all(
    prev.query("prevalence < ci_low | prevalence > ci_high").empty
    for prev in (econ_all_fam, nonecon_all_fam)
)

# Figure layout: three stacked panels whose heights are proportional to the number
# of bars groups they hold (3 dimensions, n_econ and n_nonecon attribute categories)
n_econ = econ_all_fam['attribute'].nunique()
n_nonecon = nonecon_all_fam['attribute'].nunique()
r_ = 0.5  # figure height contributed by each attribute row
fig_height = r_ * 3 + r_ * n_econ + r_ * n_nonecon + 2.5
fig, axes = plt.subplots(
    nrows=3,
    ncols=1,
    figsize=(6, fig_height*0.97),
    gridspec_kw={'height_ratios': [3, n_econ, n_nonecon]},
    sharex=True,
    dpi=200
)

# Top panel: attribute dimensions (dim_all_fam is computed on all mentions)
plot_prevalence_bars(
    dim_all_fam,
    axes[0],
    title='Attribute dimensions',
    attribute_order=['universal', 'economic', 'non-economic'],
    hue_col='party_family_label',
    hue_order=all_fam_order_alt,
    palette=all_fam_palette,
    xlim=(0, .6)
)

# Middle panel: economic attribute categories
plot_prevalence_bars(
    econ_all_fam,
    axes[1],
    'Economic attributes',
    hue_col='party_family_label',
    hue_order=all_fam_order_alt,
    palette=all_fam_palette,
    xlim=(0, .6)
)

# Bottom panel: non-economic attribute categories
plot_prevalence_bars(
    nonecon_all_fam,
    axes[2],
    'Non-economic attributes',
    hue_col='party_family_label',
    hue_order=all_fam_order_alt,
    palette=all_fam_palette,
    xlim=(0, .6)
)

# Add shared legend below the lower plot
# NOTE(review): legend entries follow all_fam_order, while the bars follow
# all_fam_order_alt; colours still match their labels, only the ordering differs.
handles = [plt.Rectangle((0,0),1,1, fc=all_fam_palette[fam]) for fam in all_fam_order]
fig.legend(handles, all_fam_order, loc='lower center', bbox_to_anchor=(0.65, -0.06), ncol=2, frameon=False)

plt.tight_layout()
plt.show()

In [None]:
# Attribute-bearing mentions of PRR and Green parties, tagged with family label and decade
fam_ts = df_any.loc[df_any['party_family'].isin(['prrp', 'green'])].copy()
fam_ts = fam_ts.assign(
    party_family_label=fam_ts['party_family'].apply(normalize_family),
    decade=fam_ts['year'] // 10 * 10,
)

# Prevalence of every attribute category by family and decade
plot_data = compute_prevalence(
    fam_ts,
    attribute_category_names_map,
    group_by=['party_family_label', 'decade'],
)
plot_data['attribute_label'] = plot_data['attribute'].map(attribute_category_names_map)

# Per-attribute y-axis ceiling: highest upper CI bound plus 7.5% headroom, rounded to
# two decimals. dict(...) is kept on purpose so the values remain NumPy scalars.
upper_y = dict(round(plot_data.groupby('attribute')['ci_high'].max() * 1.075, 2))

In [None]:
#| label: fig-prevalence_trends_economic
#| output: true
#| fig-cap: "Temporal trends in the prevalence of economic attributes in social group mentions by PRR and Green parties across decades. Each panel shows one economic attribute category, with error bars representing 95% confidence intervals. Lines show the share of mentions containing each attribute over time."

these = econ_attrs

# Relative panel heights: proportional to each attribute's y-axis ceiling, normalized
# to sum to 1. Built as an explicit float array; dividing a plain list in place only
# works when the values happen to be NumPy scalars.
heights = np.array([upper_y[a] for a in these], dtype=float)
heights = heights / heights.sum()

fig, axes = plt.subplots(len(these), 1, figsize=(6, 1.5 * len(these)), sharex=False, height_ratios=heights, gridspec_kw={'hspace': 4/3})
# np.atleast_1d keeps the loop valid when only a single panel is requested
# (plt.subplots then returns a bare Axes instead of an array)
for attr_to_plot, ax in zip(these, np.atleast_1d(axes)):

    attr_ts = plot_data[plot_data['attribute'] == attr_to_plot].copy()

    # One line with CI error bars per party family
    for fam, sub in attr_ts.groupby('party_family_label'):
        if sub.empty:
            continue
        # Sort into a new frame rather than in place: `sub` is a slice handed out by groupby
        sub = sub.sort_values('decade')
        # errorbar expects distances from the point estimate: row 0 = lower, row 1 = upper
        yerr = np.vstack([sub['prevalence'] - sub['ci_low'], sub['ci_high'] - sub['prevalence']])
        ax.errorbar(
            sub['decade'],
            sub['prevalence'],
            yerr=yerr,
            fmt='o-',
            color=fam_col_palette[fam],
            label=fam,
            capsize=0,
            linewidth=1.5,
            markersize=5,
        )
    ax.set_ylim(0, round(upper_y[attr_to_plot] * 1.15, 2))
    ax.set_title(attribute_category_names_map[attr_to_plot], fontweight='bold')
    ax.grid(True, axis='y', alpha=0.3)

    ax.set_ylabel("Prevalence")
    ax.set_xlabel('Decade')


# add a manual legend below the last plot (coloured patches, one per party family)
handles = [plt.Rectangle((0, 0), 1, 1, color=fam_col_palette[fam], label=fam) for fam in fam_col_palette]
fig.legend(handles, fam_col_palette.keys(), loc='lower center', bbox_to_anchor=(0.5, -0.02), ncol=2, frameon=False)

fig.tight_layout()

In [None]:
#| label: fig-prevalence_trends_noneconomic
#| output: true
#| fig-cap: "Temporal trends in the prevalence of non-economic attributes in social group mentions by PRR and Green parties across decades. Each panel shows one non-economic attribute category, with error bars representing 95% confidence intervals. Lines show the share of mentions containing each attribute over time."

# Non-economic attributes to plot; ethnicity, religion and place/location are left out
these = [a for a in nonecon_attrs if a not in ("noneconomic__ethnicity", "noneconomic__religion", "noneconomic__place_location")]

# Relative panel heights: proportional to each attribute's y-axis ceiling, normalized
# to sum to 1. Built as an explicit float array; dividing a plain list in place only
# works when the values happen to be NumPy scalars.
heights = np.array([upper_y[a] for a in these], dtype=float)
heights = heights / heights.sum()

fig, axes = plt.subplots(len(these), 1, figsize=(6, 1.5 * len(these)), sharex=False, height_ratios=heights, gridspec_kw={'hspace': 4/3})
# np.atleast_1d keeps the loop valid when only a single panel is requested
# (plt.subplots then returns a bare Axes instead of an array)
for attr_to_plot, ax in zip(these, np.atleast_1d(axes)):

    attr_ts = plot_data[plot_data['attribute'] == attr_to_plot].copy()

    # One line with CI error bars per party family
    for fam, sub in attr_ts.groupby('party_family_label'):
        # Drop decades in which the attribute's prevalence is below 1%
        sub = sub.query("prevalence >= 0.01")
        if sub.empty:
            continue
        # Sort into a new frame rather than in place: `sub` is a filtered slice
        sub = sub.sort_values('decade')
        # errorbar expects distances from the point estimate: row 0 = lower, row 1 = upper
        yerr = np.vstack([sub['prevalence'] - sub['ci_low'], sub['ci_high'] - sub['prevalence']])
        ax.errorbar(
            sub['decade'],
            sub['prevalence'],
            yerr=yerr,
            fmt='o-',
            color=fam_col_palette[fam],
            label=fam,
            capsize=0,
            linewidth=1.5,
            markersize=5,
        )
    ax.set_ylim(0, round(upper_y[attr_to_plot] * 1.15, 2))
    ax.set_title(attribute_category_names_map[attr_to_plot], fontweight='bold')
    ax.grid(True, axis='y', alpha=0.3)

    ax.set_ylabel("Prevalence")
# Outside the loop: only the last (bottom) panel gets an x-axis label
ax.set_xlabel('Decade')


# add a manual legend below the last plot (coloured patches, one per party family)
handles = [plt.Rectangle((0, 0), 1, 1, color=fam_col_palette[fam], label=fam) for fam in fam_col_palette]
fig.legend(handles, fam_col_palette.keys(), loc='lower center', bbox_to_anchor=(0.5, 0.04), ncol=2, frameon=False)

fig.tight_layout()

### Comparing co-occurrence patterns {#sec-attribute_cooc}

We argue that intersectionality in parties' group mentions is an interesting facet of their group focus strategies.
In this context, the question arises of how to compare intersectionality patterns between groups.
In our analysis, a key question along this line is whether PRR and Green parties combine attributes differently.

There are multiple ways to quantify and compare attribute co-occurrence patterns.
Each approach has its strengths and weaknesses.
Below, we discuss three possible approaches and provide recommendations for their use.

- **Comparing conditional probabilities**:
    We can compute $\Pr(\text{attribute B} \mid \text{attribute A})$ by party family and compare the values.
    Conditional probabilities have the advantage that they are very interpretable, allowing statements like "When Populist Radical-Right mentions class, 12% also mention gender."
    They thus directly answer substantive questions about co-occurrence patterns.
    Further, they do not suffer from base rate sensitivity issues like the PMI (see below).
    Subtracting the values for PRR parties from those for Green parties, for example, we obtain an indicator that is negative if PRR parties tend to combine the given attributes more frequently.
    Conditional probability differences can thus be compared across parties through simple subtraction, and the approach works well even with sparse data.
    
    The downside is that the measure is asymmetric, requiring careful interpretation. 
    Further, it does not account for statistical significance of observed differences.
    
    We therefore use it solely for _descriptive_ comparison of party families' attribute combination strategies.

- **Comparing statistical significance**:
    We can apply $\chi^2$ or Fisher's exact tests for each attribute pair to determine whether co-occurrence patterns differ significantly between party families.
    These tests provide formal hypothesis testing and control for sampling variability. 
    Effect size measures, such as Cramér's $V$, in turn, allow assessing practical significance beyond mere statistical significance.
    <!-- , and Fisher's exact test works reliably even with small cell counts. -->
    
    However, <!-- multiple comparison problems arise when testing many pairs simultaneously, requiring correction procedures like Bonferroni adjustment. The --> tests are sensitive to sample size, meaning that with large N, nearly everything becomes statistically significant. 
    Further, binary yes/no decisions do not capture the magnitude of differences.
    
    We therefore use significance testing for determining which attribute pair differences are statistically robust.

- **Comparing normalized Pointwise Mutual Information (nPMI)**:
    We can compute nPMI values by party family, which compare observed to expected co-occurrence under statistical independence.
    nPMI identifies unexpected patterns in both directions (positive associations where attributes co-occur more than expected, and negative associations where they co-occur less than expected). 
    Being normalized to a [-1, +1] scale, it allows comparing different attribute pairs.
    This makes the nPMI metric useful for exploratory analysis.
    
    However, the measure is hard to interpret substantively in terms of party strategy. 
    It is sensitive to base rates, and negative values tend to dominate in sparse data (as we observed in our analysis). 
    Additionally, differences between parties can be small even when the underlying patterns differ substantially.
    
    We therefore do not rely on nPMI analysis. <!-- for identifying which attribute pairs warrant further investigation.--> 

<!--
We therefore adopt the following workflow:

1. **Describe** combination patterns using conditional probabilities
   - P(B|A) tables by party
   - Visualize top differences

2. **Test** robustness using significance tests
   - $\chi^2$ for frequent pairs
   - Fisher's exact for rare pairs
   - Bonferroni correction for multiple comparisons

3. **Explore** unexpected patterns using nPMI
   - Identify strong positive associations (intersectional frames)
   - Identify strong negative associations (strategic compartmentalization)

4. **Interpret** in substantive context
   - Link to party ideology
   - Consider manifesto genre constraints
   - Examine qualitative examples
-->

### Statistical significance of co-occurrence differences

We report differences between PRR and Green parties' attribute co-mentioning patterns in @fig-attribute_cpr_differences as a way to understand how these two party families' group focus strategies differ through the lens of intersectionality.
Below, we report the results of $\chi^2$ tests that assess whether observed differences in parties' co-mentioning patterns are statistically significant.
Further, we rely on Cramér's $V$ to assess the practical significance of these differences -- if any.
Cramér's $V$ measures association strength between two categorical variables, ranging from 0 to 1, where 0 indicates no association (complete independence) and 1 perfect association (complete dependence).

In [None]:
def nan_if_none(x):
    """Return NaN in place of None; hand every other value back unchanged."""
    if x is None:
        return np.nan
    return x

def test_cooccurrence_difference(df_group_1, df_group_2, attr_a, attr_b):
    """
    Test if co-occurrence of attr_a and attr_b differs between two party groups.
    Returns chi2 statistic, p-value, and effect size (Cramér's V)
    """
    # Create 2x2 contingency table:
    # Row 1: Populist Radical-Right [both present, not both present]
    # Row 2: Green [both present, not both present]
    both_p1 = int((df_group_1[attr_a] & df_group_1[attr_b]).sum())
    not_both_p1 = int(len(df_group_1) - both_p1)
    both_p2 = int((df_group_2[attr_a] & df_group_2[attr_b]).sum())
    not_both_p2 = int(len(df_group_2) - both_p2)
    
    contingency = np.array([
        [both_p1, not_both_p1],
        [both_p2, not_both_p2]
    ])
    
    # Use Fisher's exact test for small cell counts
    if np.any(contingency < 5):
        _, p = fisher_exact(contingency)
        chi2 = np.nan
    else:
        try:
            chi2, p, dof, expected = chi2_contingency(contingency)
        except ValueError:
            # Expected frequencies too small
            _, p = fisher_exact(contingency)
            chi2 = np.nan
    
    # Cramér's V effect size
    n = contingency.sum()
    if not np.isnan(chi2):
        min_dim = min(contingency.shape) - 1
        cramers_v = np.sqrt(chi2 / (n * min_dim)) if n > 0 and min_dim > 0 else 0
    else:
        cramers_v = np.nan
    
    return chi2, p, cramers_v, contingency

# Run the co-occurrence test for every attribute pair and collect one record
# per pair; the records are turned into `chi2_results_df` at the end.
chi2_test_results = []

# `permutations` yields ORDERED pairs, so each unordered pair is tested twice,
# once as (a, b) and once as (b, a). The test itself is symmetric in the two
# attributes; the duplication gives the attribute_a x attribute_b pivot built
# from `chi2_results_df` an entry on both sides of the diagonal.
attr_combinations = permutations(econ_attrs + nonecon_attrs, 2)
for attr_a, attr_b in attr_combinations:
    if attr_a == attr_b:
        # only reachable if the concatenated attribute list contains duplicates
        continue  # skip same attribute pairs
    chi2, p, v, cont = test_cooccurrence_difference(
        df_prrp, # df_prrp[df_prrp[label_cols].sum(axis=1)>1], 
        df_green, # df_green[df_green[label_cols].sum(axis=1)>1], 
        attr_a, attr_b
    )
    
    # Share of rows (in percent) in which both attributes are set, per group:
    # row 0 of `cont` is the first frame passed above (df_prrp), row 1 the
    # second (df_green).
    prrp_rate = cont[0,0] / cont[0].sum() * 100
    green_rate = cont[1,0] / cont[1].sum() * 100
    
    chi2_test_results.append({
        # attribute keys are translated through attribute_category_names_map
        # (presumably to display names -- defined outside this chunk)
        'attribute_a': attribute_category_names_map[attr_a],
        'attribute_b': attribute_category_names_map[attr_b],
        'chi2': nan_if_none(chi2),
        'p_value': nan_if_none(p),
        'cramers_v': nan_if_none(v),
        'prrp_rate': nan_if_none(prrp_rate),
        'green_rate': nan_if_none(green_rate),
        # NOTE(review): raw p-value against 0.05; no multiple-comparison
        # correction is applied across the attribute pairs tested here.
        'significant': p < 0.05
    })

chi2_results_df = pd.DataFrame(chi2_test_results)

In [None]:
#| label: fig-attribute_association_difference_significance
#| output: true
#| fig-cap: "Substantive significance of differences in attribute co-occurrence patterns between PRR and Green parties. Heatmap cells show Cramér's V effect size for attribute pairs where co-occurrence significantly differs between the two party families (Chi-square test, p < 0.05). *Note:* Values on the diagonal are masked."

# Build the matrix to plot: Cramér's V for significant pairs, NaN elsewhere.
# Pairs whose p-value came from Fisher's exact test carry cramers_v = NaN
# (see test_cooccurrence_difference), so they stay blank even if significant.
chi2_results_df['value'] = chi2_results_df['cramers_v']
chi2_results_df.loc[~chi2_results_df['significant'], 'value'] = np.nan
# Reshape to attribute_a (rows) x attribute_b (columns). sort=False keeps the
# order in which the attribute names first appear in chi2_results_df.
heatmap_data = chi2_results_df.pivot_table(
    index='attribute_a',
    columns='attribute_b',
    values='value',
    fill_value=np.nan,
    dropna=False,
    observed=False,
    sort=False
)


# plot_heatmap is not defined in this chunk (presumably provided by the
# `reporting` star import -- TODO confirm). It is called with the economic and
# non-economic attribute names as two panel groups; clims presumably fixes the
# colour scale to the range 0-0.10.
fig, axes = plot_heatmap(
    x=heatmap_data,
    panel_groups=(econ_attr_names, nonecon_attr_names),
    mask_diagonal=True,
    cmap='YlOrRd',
    clims=(0, 0.10),
    clegend_title="Cramér's V"
)
# label the two panels by attribute type
axes[0].set_ylabel("economic\n", fontweight='bold')
axes[1].set_ylabel("non-economic\n", fontweight='bold')
plt.show()

@fig-attribute_association_difference_significance shows that Cramér's $V$ estimates for attribute combinations with significant differences in party families' co-occurrence patterns (according to $\chi^2$-tests) range from 0.006 to 0.054.
Values in this range are commonly interpreted as very weak (Cohen, 1988).
In particular, this means that even the "strongest" difference (_occupation/profession_ $\times$ _nationality_) explains less than 0.3% of variance ($V^2 = 0.054^2 \approx 0.003$).

This underscores that statistical significance does not equate to practical significance.
While the $\chi^2$-tests found the differences for the examined attribute combinations to be statistically significant (p < 0.05), the actual strength of association is very weak.
However, it is important to recall that most mentions in our data have no or only one attribute.
This makes our attribute co-occurrence data very sparse.
Therefore, even small $V$ values can represent meaningful political choices.
Yet, intersectionality patterns _alone_ do not produce strong separation between party families.