In [None]:
# Standard library imports
import json
from pathlib import Path
import warnings

# Data manipulation
import pandas as pd
import numpy as np
import yaml

In [None]:
# Project directory layout.
# The notebook is run from paper/secs, so the project root is two levels above the working directory.
base_path = Path.cwd().parent.parent
data_path = base_path.joinpath('data')
labeled_path = data_path.joinpath('labeled')
manifestos_path = data_path.joinpath('manifestos')
annotations_path = data_path.joinpath('annotations', 'group_mention_categorization')

In [None]:
# # Load attribute definitions from JSON
# attr_defs_path = annotations_path / 'attribute_definitions.json'
# with open(attr_defs_path, 'r') as f:
#     attribute_definitions = json.load(f)

# # Create a reference dataframe for attribute metadata
# attr_metadata = []
# for attr_name, description in attribute_definitions.items():
#     # Categorize into economic vs. non-economic based on position in JSON
#     # (First 6 are economic, rest are non-economic based on the file we saw)
#     attr_metadata.append({
#         'attribute': attr_name,
#         'description': description
#     })

# attr_df = pd.DataFrame(attr_metadata)

In [None]:
# Attribute definitions (YAML file in the annotations folder)
attrs_path = annotations_path / 'group_attributes_v2.yaml'

# Only the 'social_group' section of the file is used
with open(attrs_path, 'r') as f:
    attrs_dict = yaml.safe_load(f)['social_group']

# Attribute names per dimension, in the order they appear in the mapping;
# `label_cols` lists the economic attributes first, then the non-economic ones
econ_attrs, nonecon_attrs = (
    list(attrs_dict[section]['attributes'])
    for section in ('economic_attributes', 'non_economic_attributes')
)
label_cols = [*econ_attrs, *nonecon_attrs]

## Load datasets

### Load predicted attribute classifications

In [None]:
# Predicted attribute classifications: one tab-separated file per attribute dimension
econ_attrs_path = labeled_path / 'manifesto_sentences_predicted_social_group_mentions_with_economic_attributes_classifications.tsv'
nonecon_attrs_path = labeled_path / 'manifesto_sentences_predicted_social_group_mentions_with_noneconomic_attributes_classifications.tsv'

# Read both files with identical settings
df_econ, df_nonecon = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (econ_attrs_path, nonecon_attrs_path)
)

In [None]:
# Combine the economic and non-economic classifications of each mention.
# `validate='1:1'` makes pandas raise if the key columns are not unique in either table.
key_cols = [
    "country_iso3c",
    "sentence_id", "sentence_text",
    "span_nr", "group_type", "text",
]

df_mentions = df_econ.merge(df_nonecon, on=key_cols, how='inner', validate='1:1')

In [None]:
# Zero-pad span numbers to the width of the largest one so that mention IDs have a uniform suffix length
max_spans = df_mentions['span_nr'].max()
span_width = len(str(max_spans))
span_suffix = df_mentions['span_nr'].astype(str).str.zfill(span_width)
df_mentions['mention_id'] = df_mentions['sentence_id'].astype(str) + '-' + span_suffix

# The leading "<digits>_<digits>" part of the mention ID is taken as the manifesto ID
df_mentions['manifesto_id'] = df_mentions['mention_id'].str.extract(r'^(\d+_\d+)-.+', expand=False)

# Split the manifesto ID once and store its two parts as integers
manifesto_parts = df_mentions['manifesto_id'].str.split('_')
df_mentions.loc[:, "party_id"] = manifesto_parts.str[0].astype(int)
df_mentions.loc[:, "date"] = manifesto_parts.str[1].astype(int)

### Load party-election population data


In [None]:
# Load the party-election population table from data/manifestos/cases/dataset_population.tsv
# (path is built relative to the project root rather than a user-specific absolute path)
fp = manifestos_path / 'cases' / 'dataset_population.tsv'
df_population = pd.read_csv(fp, sep='\t')

# The part of `manifesto_id` after the first underscore is stored as the integer `date`
df_population['date'] = df_population['manifesto_id'].str.split('_').str[1].astype(int)

# Preview as the last expression of the cell so that it is actually rendered
df_population.head()

In [None]:
# One row per country: first non-null country name and party ID found for it
country_codes = (
    df_population
    .groupby('country_iso3c')
    .first()
    .reset_index()
)[['country_iso3c', 'country_name', "party_id"]]

# Country code = leading digits of the party ID: two digits for 5-character IDs, one digit otherwise
party_id_str = country_codes["party_id"].astype(str)
country_codes["country_code"] = (
    party_id_str.str[:2]
    .where(party_id_str.str.len() == 5, party_id_str.str[:1])
    .astype(int)
)

In [None]:
# def lookup_country(id: str) -> int:
#     """Lookup country name from party_id."""
#     if pd.isna(id):
#         return None
#     id = str(id)
#     code = int(id[:2]) if len(id) == 5 else int(id[:1])
#     match = country_codes[country_codes['country_code'] == code]
#     if not match.empty:
#         return match.iloc[0]['country_iso3c']
#     return None

### Load party mapping information

In [None]:
# Location of the party mapping workbook
party_mapping_path = manifestos_path.joinpath('party_mapping_updated.xlsx')

# Read the workbook with pandas' defaults and preview the first three rows
df_party = pd.read_excel(party_mapping_path)
df_party.head(n=3)

In [None]:
# Every party mapping row must have both a party ID and a date
assert df_party[["party_id", "date"]].notna().all(axis=1).all()

In [None]:
# Build the manifesto identifier "<party_id>_<date>" from the whitespace-stripped string
# forms of both columns (vectorised; also well-defined for an empty frame).
id_parts = df_party[["party_id", "date"]].astype(str)
df_party.loc[:, "manifesto_id"] = id_parts["party_id"].str.strip() + '_' + id_parts["date"].str.strip()

# Create a categorical indicator from the one-hot party category columns.
# If several flags equal 1, the first match in this priority order wins:
# green > prrp > sd > con; rows with no flag set become 'other'.
family_priority = ['green', 'prrp', 'sd', 'con']
family_conditions = [df_party[flag].eq(1).to_numpy(dtype=bool) for flag in family_priority]

# The category order below is con < sd < prrp < green < other
df_party['party_family'] = pd.Categorical(
    np.select(family_conditions, family_priority, default='other'),
    categories=['con', 'sd', 'prrp', 'green', 'other'],
    ordered=True,
)


In [None]:
# check uniqueness
tmp = df_party.groupby(['manifesto_id', 'party_id', 'date']).size()
tmp[tmp>1].reset_index().merge(df_party, on=['manifesto_id', 'party_id', 'date'], how='left').sort_values(by=['manifesto_id', 'party_id', 'date'])

In [None]:
# Drop the "date_new" column when it is present, then remove exact duplicate rows
df_party = df_party.drop(columns=["date_new"], errors="ignore").drop_duplicates()
# check again: keys that still occur more than once
tmp = df_party.groupby(['manifesto_id', 'party_id', 'date']).size()
tmp.loc[tmp.gt(1)]
# ✅

In [None]:
# Mentions whose manifesto ID does not occur in the party mapping
not_in_party_mapping = ~df_mentions['manifesto_id'].isin(df_party['manifesto_id'])
df_mentions.loc[not_in_party_mapping]
# ✅

In [None]:
# Distinct manifesto IDs that occur in the mention data (single-column frame)
df_mentions_manifestos = df_mentions.loc[:, ['manifesto_id']].drop_duplicates()

In [None]:
# Outer join with indicator: counts manifestos found only in the mentions,
# only in the party mapping, or in both
party_keys = df_party[["party_id", "date", "manifesto_id"]]
tmp = pd.merge(df_mentions_manifestos, party_keys, on="manifesto_id", how='outer', indicator=True)
tmp['_merge'].value_counts()
# TODO: go to notebook ../apx/apx_case_descriptives.ipynb and verify completeness

In [None]:
# Outer join with indicator: counts manifestos found only in the mentions,
# only in the population data, or in both
population_keys = df_population[["party_id", "date", "manifesto_id"]]
tmp = pd.merge(df_mentions_manifestos, population_keys, on="manifesto_id", how='outer', indicator=True)
tmp['_merge'].value_counts()
# TODO: go to notebook ../apx/apx_case_descriptives.ipynb and verify completeness

In [None]:
# NOTE: do we at least cover all parties in the population dataset?
tmp = df_mentions[["party_id"]].drop_duplicates().merge(df_population[["party_id", "party_name"]].drop_duplicates(), on="party_id", how='outer', indicator=True)
print(dict(tmp._merge.value_counts()))
# No, but these are the center-left and -right mainstream parties from Sweden, Germany, UK, and USA added after initial case selection (covered in `df_party`)
tmp.query('_merge == "left_only"').merge(df_party[["party_id", "name"]].drop_duplicates(), on="party_id", how='left')

In [None]:
# Attach the party name recorded in the population data to the party mapping.
# `validate='m:1'` raises if the population data holds more than one name for a
# (party_id, date) pair, which would otherwise silently duplicate party rows.
population_names = df_population[["party_id", "party_name", "date"]].drop_duplicates()
df_party = df_party.merge(population_names, on=["party_id", "date"], how='left', validate='m:1')

In [None]:
# Hard-coded party names, applied to every row whose original `name` matches the key
party_name_overrides = {
    "Socialdemokratiska Arbetareparti": "Social Democratic Labour Party",
    "Hogerpartiet": "Moderate Coalition Party",
    "Sozialdemokratische Partei Deutschlands": "Social Democratic Party of Germany",
    "Christlich-Demokratische Union": "Christian Democratic Union/Christian Social Union",
}
for original_name, display_name in party_name_overrides.items():
    df_party.loc[df_party['name'] == original_name, "party_name"] = display_name

# Rows that still have no party name fall back to the original name
df_party['party_name'] = df_party['party_name'].fillna(df_party['name'])

## Join datasets

The economic and non-economic attribute classifications were already joined above (`df_mentions`); now we'll add the party metadata to them.

In [None]:
# Perform the join
df = pd.merge(
    df_mentions,
    df_party,
    on=['manifesto_id', 'party_id', 'date'],
    how='left',
    validate='m:1'
)

In [None]:
# # Save the processed dataset for future use
# output_path = base_path / 'mentions_with_attributes_and_party_metadata.parquet'
# df_full.to_parquet(output_path, index=False)
# print(f"Saved processed dataset to: {output_path}")
# print(f"File size: {output_path.stat().st_size / 1024 / 1024:.2f} MB")

In [None]:
# Read characters 1-4 of the date's string form as the year and characters 5-6 as the month
date_str = df['date'].astype(str)
df['year'] = date_str.str.slice(0, 4).astype(int)
df['month'] = date_str.str.slice(4, 6).astype(int)

In [None]:
# Collect the classification columns by their name prefix.
# NOTE: this rebinds `econ_attrs` / `nonecon_attrs` to column names of `df_mentions`.
econ_attrs = list(filter(lambda col: col.startswith('economic__'), df_mentions.columns))
nonecon_attrs = list(filter(lambda col: col.startswith('noneconomic__'), df_mentions.columns))

In [None]:
# Columns to keep, in output order: identifiers and metadata first, then the attribute columns
metadata_cols = [
    'country_iso3c',
    'party_id', 'party_name', 'name', 'party_family',
    'date', 'year', 'month',
    'manifesto_id',
    'sentence_id', 'sentence_text',
    'mention_id',  'span_nr', 'text', # NOTE: omitted 'group_type' because all are 'social_group'
]
key_cols = metadata_cols + list(econ_attrs) + list(nonecon_attrs)

# Columns of `df` that will be dropped by the selection
set(df.columns).difference(key_cols)

In [None]:
# Keep only the selected columns and rename the mapping's `name` column
# to tell it apart from `party_name`
df = df.loc[:, key_cols].rename(columns={'name': 'party_name_original'})

In [None]:
# Overview of the distinct (party_name, party_name_original) pairs, sorted by both
name_cols = ["party_name", "party_name_original"]
df.loc[:, name_cols].drop_duplicates().sort_values(by=name_cols)

## Save

In [None]:
# Write the final mention-level dataset as a pickle into the labeled-data folder
fp = labeled_path.joinpath('labeled_mentions_with__party_metadata.pkl')
df.to_pickle(path=fp)

In [None]:
# Show the column index of `df` as written to disk in the previous cell
df.columns