In [2]:
import pandas as pd
from src import config
from src.votes.config import RESULT_CSV_FOLDER
from src.enums import VoteResultEnum


def vote_counts_to_result(votes: pd.Series) -> str:
    """Return the label of the vote option with the highest count.

    Args:
        votes: A Series (typically one row of a per-party result frame) whose
            index contains the ``.value`` of ``VoteResultEnum.ANNAHME``,
            ``VoteResultEnum.ABLEHNUNG`` and ``VoteResultEnum.ENTHALTUNG``.
            Any other entries in the Series are ignored.

    Returns:
        The index label of the largest of the three counts. On a tie the
        first label in the order ANNAHME, ABLEHNUNG, ENTHALTUNG is returned,
        because ``idxmax`` reports the first occurrence of the maximum.

    Raises:
        KeyError: If one of the three option labels is missing from ``votes``.
        ValueError: If a count cannot be interpreted as a number.
    """
    option_labels = [
        VoteResultEnum.ANNAHME.value,
        VoteResultEnum.ABLEHNUNG.value,
        VoteResultEnum.ENTHALTUNG.value,
    ]
    # When this is called through ``DataFrame.apply(axis=1)`` on a frame that
    # also holds non-numeric columns, each row arrives as an object-dtype
    # Series. Coerce the selected counts to a numeric dtype so ``idxmax``
    # operates on numbers instead of generic Python objects.
    counts = pd.to_numeric(votes[option_labels])
    return counts.idxmax()


def flip_vote_result(vote_result: str) -> str:
    """Mirror a vote result.

    ANNAHME and ABLEHNUNG are swapped; ENTHALTUNG maps to itself.

    Args:
        vote_result: One of the three ``VoteResultEnum`` option values.

    Returns:
        The mirrored option value.

    Raises:
        ValueError: If ``vote_result`` equals none of the three option values.
    """
    # (input value, mirrored value) pairs, checked in this order.
    mirror_table = (
        (VoteResultEnum.ANNAHME.value, VoteResultEnum.ABLEHNUNG.value),
        (VoteResultEnum.ABLEHNUNG.value, VoteResultEnum.ANNAHME.value),
        (VoteResultEnum.ENTHALTUNG.value, VoteResultEnum.ENTHALTUNG.value),
    )
    for source_value, mirrored_value in mirror_table:
        if vote_result == source_value:
            return mirrored_value

    raise ValueError(
        f"Invalid vote result: {vote_result}. Expected one of {VoteResultEnum.ANNAHME.value}, "
        f"{VoteResultEnum.ABLEHNUNG.value}, {VoteResultEnum.ENTHALTUNG.value}."
    )


def build_party_df(party: str) -> pd.DataFrame:
    """Load one party's result CSV and reduce it to its ground-truth label.

    Args:
        party: Party name; also the stem of the CSV file read from
            ``RESULT_CSV_FOLDER``.

    Returns:
        A frame with the columns ``vote_id``, ``ground_truth`` and ``party``,
        where ``ground_truth`` is ``vote_counts_to_result`` applied to each
        row and ``party`` repeats the given party name.
    """
    output_columns = ["vote_id", "ground_truth", "party"]
    labelled = pd.read_csv(f"{RESULT_CSV_FOLDER}/{party}.csv").assign(
        # Row-wise: pick the option with the most votes for every vote_id.
        ground_truth=lambda frame: frame.apply(vote_counts_to_result, axis=1),
        party=party,
    )
    return labelled[output_columns]


# Ground-truth labels for every configured party, stacked into one frame.
party_results = pd.concat(
    [build_party_df(party) for party in config.PARTIES],
    ignore_index=True,
)

# Predictions without the bulky `content` column; `drucksache_title` is
# exposed under the shorter name `title`.
all_predictions = (
    pd.read_parquet("data/predictions.parquet")
    .drop(columns=["content"])
    .rename(columns={"drucksache_title": "title"})
)

# Attach the ground truth per (vote_id, party) and keep only the predictions
# for which a ground-truth label exists.
with_ground_truth = all_predictions.merge(
    party_results,
    on=["vote_id", "party"],
    how="left",
)
# `.copy()` makes the filtered frame independent of the merge result, so the
# `.loc` assignment below cannot trigger SettingWithCopyWarning or write to a
# temporary slice.
with_ground_truth = with_ground_truth[
    with_ground_truth["ground_truth"].notna()
].copy()

# Rows whose `beschlussempfehlung` equals the ABLEHNUNG value get their ground
# truth mirrored via `flip_vote_result`.
# NOTE(review): presumably the recorded vote is on the recommendation itself
# in these cases, hence the mirroring - confirm against the data source.
is_rejection_recommendation = (
    with_ground_truth["beschlussempfehlung"] == VoteResultEnum.ABLEHNUNG.value
)
with_ground_truth.loc[is_rejection_recommendation, "ground_truth"] = (
    with_ground_truth.loc[is_rejection_recommendation, "ground_truth"].apply(
        flip_vote_result
    )
)

In [3]:

from src.feature_engineering.categories import get_category_column


# Preview only: derive the category for the first ten summary embeddings.
get_category_column(with_ground_truth["summary_embedding"].head(10))

[32m2025-05-29 11:38:36.294[0m | [1mINFO    [0m | [36msrc.feature_engineering.categories[0m:[36mget_category_column[0m:[36m32[0m - [1mCalculating closest categories through embeddings[0m
[32m2025-05-29 11:38:36.295[0m | [1mINFO    [0m | [36msrc.feature_engineering.categories[0m:[36mget_embeddings[0m:[36m11[0m - [1mLoading existing categories embeddings...[0m


0    Verteidigung & Sicherheit - Militär, Verteidig...
1    Verteidigung & Sicherheit - Militär, Verteidig...
2    Verteidigung & Sicherheit - Militär, Verteidig...
3    Inneres & Migration - Innere Sicherheit, öffen...
4    Justiz & Verbraucherschutz - Rechtsprechung, G...
5    Inneres & Migration - Innere Sicherheit, öffen...
6    Inneres & Migration - Innere Sicherheit, öffen...
7    Inneres & Migration - Innere Sicherheit, öffen...
8    Inneres & Migration - Innere Sicherheit, öffen...
9    Gesundheit - Gesundheitssystem, Krankenversich...
Name: summary_embedding, dtype: object

In [1]:
from src.feature_engineering.mirror_beschlussempfehlung import prepare_final_dataset
from src.feature_engineering.categories import get_category_column


# Build the final dataset, then add a `category` column computed from each
# row's summary embedding.
dataset = prepare_final_dataset()
category_labels = get_category_column(dataset["summary_embedding"])
dataset["category"] = category_labels

[32m2025-05-29 11:42:17.779[0m | [1mINFO    [0m | [36msrc.feature_engineering.categories[0m:[36mget_category_column[0m:[36m32[0m - [1mCalculating closest categories through embeddings[0m
[32m2025-05-29 11:42:17.780[0m | [1mINFO    [0m | [36msrc.feature_engineering.categories[0m:[36mget_embeddings[0m:[36m11[0m - [1mLoading existing categories embeddings...[0m


In [14]:
# Display `party_results` for inspection.
# NOTE(review): the saved output below lists the raw count columns and has no
# `party` column, which does not match what `build_party_df` currently
# returns - the output looks stale; re-run the notebook top to bottom to confirm.
party_results

Unnamed: 0,vote_id,Annahme,Ablehnung,Enthaltung,ground_truth
0,20250318_3,0,73,0,Ablehnung
1,20250318_2,0,73,0,Ablehnung
2,20250318_1,0,73,0,Ablehnung
3,20250131_1,75,0,0,Annahme
4,20250130_4,0,66,0,Ablehnung
...,...,...,...,...,...
3654,20121025_4,0,220,0,Ablehnung
3655,20121025_3,0,223,0,Ablehnung
3656,20121025_2,222,0,0,Annahme
3657,20121025,224,0,0,Annahme
