In [5]:
import pandas as pd
import plotly.express as px
from src import config
from tqdm import tqdm
from src.utils.llm.openai_client import get_proposer
# Load the saved predictions table from the local output directory.
# NOTE(review): `predictions` is not referenced by any other cell visible in
# this part of the notebook — confirm it is still needed.
predictions = pd.read_parquet("output/predictions.parquet")

In [11]:
from src.feature_engineering.categories import get_category_column
from src.feature_engineering.legislature_period import get_legislature_period_metadata
from src.feature_engineering.mirror_beschlussempfehlung import prepare_final_dataset


# Start from the prepared dataset and attach a category computed from the
# summary embeddings.
dataset = prepare_final_dataset()
dataset["category"] = get_category_column(dataset["summary_embedding"])


def _lookup_period_metadata(row):
    """Return the legislature-period metadata for one row's party and date."""
    return get_legislature_period_metadata(row["party"], row["date"])


# Stage the per-row lookup result in a scratch column, fan it out into the two
# named columns, then remove the scratch column again.
dataset["metadata"] = dataset.apply(_lookup_period_metadata, axis=1)
metadata_columns = ["is_governing", "bundestag"]
dataset[metadata_columns] = dataset["metadata"].apply(pd.Series)
del dataset["metadata"]

# Flag the rows whose prediction matches the ground truth.
dataset["vote_correct"] = dataset["prediction"].eq(dataset["ground_truth"])

[32m2025-05-31 12:21:00.906[0m | [1mINFO    [0m | [36msrc.feature_engineering.categories[0m:[36mget_category_column[0m:[36m36[0m - [1mCalculating closest categories through embeddings[0m
[32m2025-05-31 12:21:00.907[0m | [1mINFO    [0m | [36msrc.feature_engineering.categories[0m:[36mget_embeddings[0m:[36m13[0m - [1mLoading existing categories embeddings...[0m


In [45]:
from src.votes.build import is_own_proposal


# Load the votes table from the local output directory.
# NOTE(review): `is_own_proposal`, imported at the top of this cell, is not
# used in this cell — confirm whether the import is still needed.
votes = pd.read_parquet("output/votes.parquet")

# Bare last expression: the frame is rendered as this cell's output.
votes


Unnamed: 0,vote_id,type,drucksache_title,drucksache_id,beschlussempfehlung,content,summary,summary_embedding,date,proposers
0,20250318_3,Gesetzentwurf,Gesetzentwurf der Fraktionen der SPD und CDU/C...,20/15096,,A. Problem Fundamentale Veränderungen der Sich...,Abgestimmt wird über die Einrichtung einer ver...,"[0.009540573693811893, 0.03497336432337761, 0....",2025-03-18,"[SPD, Union]"
1,20250318_2,Änderungsantrag,Änderungsantrag der Fraktion der FDP zu der zw...,20/15120,,Deutscher Bundestag Drucksache 20/15120 20. Wa...,"Der Bundestag soll beschließen, das Grundgeset...","[0.011175020597875118, 0.03776725009083748, 0....",2025-03-18,[FDP]
2,20250318_1,Gesetzentwurf,Gesetzentwurf der Fraktion der FDP Entwurf ein...,20/15099,,A. Problem Der russische Angriffskrieg gegen d...,Im Bundestag wird über die Erweiterung des Son...,"[0.012960884720087051, 0.037686433643102646, 0...",2025-03-18,[FDP]
3,20250131_1,Gesetzentwurf,Gesetzentwurf der Fraktion der CDU/CSU Entwurf...,20/12804,,A. Problem § 1 Absatz 1 Satz 1 bis 3 des Aufen...,Das Gesetz soll das Ziel der Begrenzung der Zu...,"[0.04290461167693138, 0.07238075137138367, 0.0...",2025-01-31,[Union]
4,20241219_1,Gesetzentwurf,"Gesetzentwurf der Fraktionen SPD, CDU/CSU, BÜN...",20/12977,,A. Problem und Ziel Das Grundgesetz hat sich f...,Im Bundestag wird über einen Gesetzentwurf abg...,"[-0.013081829994916916, 0.0215305108577013, 0....",2024-12-19,"[SPD, Union, DIE_GRÜNEN, FDP]"
...,...,...,...,...,...,...,...,...,...,...
658,20130131,Antrag,Antrag der Bundesregierung Fortsetzung der Bet...,17/11685,Annahme,Deutscher Bundestag Drucksache 17/11685 17. Wa...,Der Bundestag stimmt der Fortsetzung des Einsa...,"[-0.018130291253328323, 0.036971572786569595, ...",2013-01-31,[Bundesregierung]
659,20121214_1,Antrag,Antrag der Bundesregierung Entsendung bewaffne...,17/11783,Annahme,Deutscher Bundestag Drucksache 17/11783 17. Wa...,Der Bundestag stimmt der Entsendung bewaffnete...,"[-0.03251594305038452, 0.022284308448433876, 0...",2012-12-14,[Bundesregierung]
660,20121213_4,Antrag,Antrag der Bundesregierung Fortsetzung des Ein...,17/11466,Annahme,Deutscher Bundestag Drucksache 17/11466 17. Wa...,Der Bundestag stimmt der Fortsetzung des Einsa...,"[-0.00255092466250062, 0.07303918898105621, 0....",2012-12-13,[Bundesregierung]
661,20121213_2,Antrag,"Antrag der Abgeordneten Johanna Voß, Ulla Lötz...",17/11328,Ablehnung,Deutscher Bundestag Drucksache 17/11328 17. Wa...,Abgestimmt wird über ein Gesetz zur vollständi...,"[-0.004911317024379969, 0.06889131665229797, 0...",2012-12-13,[DIE_LINKE]


In [1]:
from src.run_preprocessing import run_preprocessing

# Run the project's preprocessing entry point. In the recorded output of this
# cell it logs a vote-metadata scraping check, manifesto downloads, building of
# vote entrypoints (706 items, about 12 minutes) and content extraction from
# drucksachen. Per-item failures ("Title not found", one HTTP 404) are logged
# as errors while the progress bar continues.
# NOTE(review): this cell has execution count 1 although it appears after
# cells numbered 5, 11 and 45, so the notebook was not run top to bottom.
# Presumably this step produces the parquet files read in earlier cells —
# TODO confirm and move it ahead of them if so.
run_preprocessing()

[32m2025-05-31 14:20:29.937[0m | [1mINFO    [0m | [36msrc.votes.gather[0m:[36mscrape_urls[0m:[36m56[0m - [1mVote metadata already exists. Skipping scraping.[0m
Downloading and summarizing manifestos:   0%|          | 0/29 [00:00<?, ?manifesto/s][32m2025-05-31 14:20:30.005[0m | [1mINFO    [0m | [36msrc.manifestos.download[0m:[36mdownload_manifestos[0m:[36m40[0m - [1mManifesto for FDP 2009 already downloaded. Skipping.[0m
[32m2025-05-31 14:20:30.006[0m | [1mINFO    [0m | [36msrc.manifestos.download[0m:[36mdownload_manifestos[0m:[36m40[0m - [1mManifesto for FDP 2013 already downloaded. Skipping.[0m
[32m2025-05-31 14:20:30.007[0m | [1mINFO    [0m | [36msrc.manifestos.download[0m:[36mdownload_manifestos[0m:[36m40[0m - [1mManifesto for FDP 2017 already downloaded. Skipping.[0m
[32m2025-05-31 14:20:30.008[0m | [1mINFO    [0m | [36msrc.manifestos.download[0m:[36mdownload_manifestos[0m:[36m40[0m - [1mManifesto for FDP 2021 already downl



Building vote entrypoints:  51%|█████     | 358/706 [06:15<05:29,  1.05vote/s][32m2025-05-31 14:26:45.214[0m | [31m[1mERROR   [0m | [36msrc.votes.build[0m:[36mbuild_entrypoints[0m:[36m103[0m - [31m[1mError building vote 358: Title not found in 358[0m
Building vote entrypoints:  58%|█████▊    | 407/706 [07:00<05:30,  1.10s/vote][32m2025-05-31 14:27:30.238[0m | [31m[1mERROR   [0m | [36msrc.votes.build[0m:[36mbuild_entrypoints[0m:[36m103[0m - [31m[1mError building vote 407: Title not found in 407[0m
Building vote entrypoints:  64%|██████▍   | 451/706 [07:41<04:25,  1.04s/vote][32m2025-05-31 14:28:12.114[0m | [31m[1mERROR   [0m | [36msrc.votes.build[0m:[36mdownload_vote_documents[0m:[36m47[0m - [31m[1mError downloading 11/12207: 404 Client Error: Not Found for url: https://dserver.bundestag.de/btd/11/122/1112207.pdf[0m
Building vote entrypoints:  69%|██████▉   | 489/706 [08:24<02:20,  1.54vote/s]



Building vote entrypoints:  74%|███████▍  | 522/706 [09:01<04:40,  1.53s/vote][32m2025-05-31 14:29:31.692[0m | [31m[1mERROR   [0m | [36msrc.votes.build[0m:[36mbuild_entrypoints[0m:[36m103[0m - [31m[1mError building vote 522: Title not found in 522[0m
[32m2025-05-31 14:29:31.746[0m | [31m[1mERROR   [0m | [36msrc.votes.build[0m:[36mbuild_entrypoints[0m:[36m103[0m - [31m[1mError building vote 523: Title not found in 523[0m
Building vote entrypoints:  75%|███████▍  | 527/706 [09:05<03:38,  1.22s/vote][32m2025-05-31 14:29:35.958[0m | [31m[1mERROR   [0m | [36msrc.votes.build[0m:[36mbuild_entrypoints[0m:[36m103[0m - [31m[1mError building vote 527: Title not found in 527[0m
Building vote entrypoints:  80%|███████▉  | 564/706 [09:41<02:30,  1.06s/vote][32m2025-05-31 14:30:11.748[0m | [31m[1mERROR   [0m | [36msrc.votes.build[0m:[36mbuild_entrypoints[0m:[36m103[0m - [31m[1mError building vote 564: Title not found in 564[0m
Building vote entr



Building vote entrypoints:  86%|████████▌ | 606/706 [10:29<02:04,  1.25s/vote]



Building vote entrypoints:  86%|████████▌ | 607/706 [10:30<02:01,  1.23s/vote]



Building vote entrypoints:  86%|████████▌ | 608/706 [10:31<02:01,  1.24s/vote]



Building vote entrypoints:  86%|████████▋ | 609/706 [10:32<01:54,  1.18s/vote]



Building vote entrypoints:  86%|████████▋ | 610/706 [10:33<01:48,  1.13s/vote]



Building vote entrypoints:  89%|████████▉ | 628/706 [10:50<00:55,  1.40vote/s][32m2025-05-31 14:31:20.160[0m | [31m[1mERROR   [0m | [36msrc.votes.build[0m:[36mbuild_entrypoints[0m:[36m103[0m - [31m[1mError building vote 628: Title not found in 628[0m
Building vote entrypoints:  91%|█████████ | 640/706 [11:00<00:40,  1.63vote/s]



Building vote entrypoints:  91%|█████████ | 642/706 [11:03<01:02,  1.02vote/s]



Building vote entrypoints:  93%|█████████▎| 654/706 [11:18<00:59,  1.14s/vote]



Building vote entrypoints:  98%|█████████▊| 692/706 [11:50<00:15,  1.09s/vote][32m2025-05-31 14:32:20.318[0m | [31m[1mERROR   [0m | [36msrc.votes.build[0m:[36mbuild_entrypoints[0m:[36m103[0m - [31m[1mError building vote 692: Title not found in 692[0m
Building vote entrypoints:  99%|█████████▉| 698/706 [11:56<00:10,  1.28s/vote][32m2025-05-31 14:32:26.761[0m | [31m[1mERROR   [0m | [36msrc.votes.build[0m:[36mbuild_entrypoints[0m:[36m103[0m - [31m[1mError building vote 698: Title not found in 698[0m
Building vote entrypoints: 100%|██████████| 706/706 [12:01<00:00,  1.02s/vote]
[32m2025-05-31 14:32:35.063[0m | [31m[1mERROR   [0m | [36msrc.drucksachen.beschlussempfehlung[0m:[36mget_content[0m:[36m24[0m - [31m[1mBeschlussempfehlung start not found in 19/18108[0m
[32m2025-05-31 14:32:37.762[0m | [1mINFO    [0m | [36msrc.votes.build[0m:[36mbuild[0m:[36m205[0m - [1mExtracting content from drucksachen...[0m
 13%|█▎        | 84/669 [03:22<31:1