In [10]:
import pandas as pd
from collections import Counter

# Load one predictions frame per model, keyed by model name.
models = ["deepseek_reasoner", "gpt_4_1", "gpt_4_1_mini"]
predictions = {
    name: pd.read_parquet(f"output/predictions_{name}.parquet")
    for name in models
}

# Columns that identify a single row to vote on, plus the slice taken from
# each secondary model before joining.
join_keys = ['vote_id', 'drucksache_id', 'party']
vote_columns = ['vote_id', 'drucksache_id', 'party', 'prediction']

# The deepseek_reasoner frame is the base: all of its columns are kept and its
# 'prediction' column is renamed so it does not clash with the joined ones.
base_df = predictions['deepseek_reasoner'].copy().rename(
    columns={'prediction': 'deepseek_reasoner_prediction'}
)

# Left joins keep every row of the base frame; a model that has no row for a
# given key contributes a missing value, which the voting step skips.
merged_df = (
    base_df
    .merge(predictions['gpt_4_1'][vote_columns], on=join_keys, how='left')
    .rename(columns={'prediction': 'gpt_4_1_prediction'})
    .merge(predictions['gpt_4_1_mini'][vote_columns], on=join_keys, how='left')
    .rename(columns={'prediction': 'gpt_4_1_mini_prediction'})
)

def get_majority_vote(row):
    """
    Return the most frequent prediction among the three model columns of a row.

    Missing predictions (None / NaN) are ignored. When several predictions
    share the highest count, the one that appears first in the column order
    deepseek_reasoner -> gpt_4_1 -> gpt_4_1_mini wins. Returns None when
    every prediction is missing.
    """
    prediction_columns = (
        'deepseek_reasoner_prediction',
        'gpt_4_1_prediction',
        'gpt_4_1_mini_prediction',
    )
    tally = Counter(
        row[column] for column in prediction_columns if pd.notna(row[column])
    )

    # An empty tally means no model produced a usable prediction.
    if not tally:
        return None

    # most_common() orders equal counts by first appearance, so ties resolve
    # to the earliest column in prediction_columns.
    winner, _ = tally.most_common(1)[0]
    return winner

# Compute the ensemble prediction row by row (axis=1 passes each row to the
# voting function) and store it alongside the per-model columns.
majority_predictions = merged_df.apply(get_majority_vote, axis=1)
merged_df['prediction'] = majority_predictions

# Independent copy, so later changes to final_df do not touch merged_df.
final_df = merged_df.copy()


In [12]:
# Persist the merged frame, including the majority-vote 'prediction' column;
# index=False leaves the DataFrame index out of the written file.
final_df.to_parquet("output/predictions_majority_vote.parquet", index=False)