In [164]:
import pandas as pd
import numpy as np
from src.utilities import get_data_path

# Same source file name as used earlier; the cleaned copy carries a "_clean" suffix
file_name = "2025_presidential_round2.csv"
cleaned_path = get_data_path(
    "processed",
    "poland",
    f"{file_name[:-4]}_clean.csv",  # drop the 4-character ".csv" extension first
)

# Read the cleaned data: semicolon-separated, UTF-8 encoded
df = pd.read_csv(cleaned_path, encoding="utf-8", sep=";")

print("✅ Cleaned DataFrame loaded.")

✅ Cleaned DataFrame loaded.


In [165]:
# Display settings: never truncate rows, show up to 100 columns
# (use a number such as 100 for "display.max_rows" to cap the rows shown)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", 100)

In [166]:
# Keep only rows that have both a postal code and a gmina TERYT code
# (rows without teryt_gmina are possibly "zagranica", i.e. abroad).
has_location = df["postal_code"].notna() & df["teryt_gmina"].notna()
# Explicit copy: the columns added below must be written to an independent
# frame rather than to a slice of the loaded one (avoids SettingWithCopyWarning).
df = df.loc[has_location].copy()

# Clean postal codes: remove the dash (literal match, not a regular expression)
df['postal_clean'] = df['postal_code'].str.replace('-', '', regex=False)
# For every row, the number of rows sharing its cleaned postal code
df['postal_clean_value_count'] = df['postal_clean'].map(df['postal_clean'].value_counts())


In [167]:
# Bucket size limits: smallest and largest number of rows a bucket may hold
MIN_BUCKET_SIZE, MAX_BUCKET_SIZE = 10, 16

In [168]:
# 1. The initial grouping was based on the first two digits of the postal code (e.g. "30" for the Kraków area).
# 2. If the resulting group contained between 10 and 16 commissions, it was accepted unchanged.
# No two-digit (XX) group is smaller than 16, so the grouping starts with three digits (XX-X).
df["postal_3"] = df["postal_clean"].astype(str).str[:3]
# Size of the three-digit group each row belongs to
postal_3_sizes = df["postal_3"].value_counts()
df["postal_3_value_count"] = df["postal_3"].map(postal_3_sizes)

In [169]:
# Step 1: size of every three-digit prefix group
postal_3_counts = df["postal_3"].value_counts()

# Step 2: prefixes whose group size lies within the allowed range (bounds included)
size_ok = postal_3_counts.between(MIN_BUCKET_SIZE, MAX_BUCKET_SIZE)
valid_postals = postal_3_counts[size_ok].index

# Step 3: rows of a valid group take the prefix as bucket label; all others stay NaN
belongs_to_valid_group = df["postal_3"].isin(valid_postals)
df["bucket"] = df["postal_3"].where(belongs_to_valid_group)

In [170]:
# df["bucket"].value_counts()

In [171]:
# df[df["bucket"] == "513"]

In [172]:
# Same approach with four digits (XX-XX), applied only to rows without a bucket

df["postal_4"] = df["postal_clean"].astype(str).str[:4]
postal_4_counts = df["postal_4"].value_counts()
df["postal_4_value_count"] = df["postal_4"].map(postal_4_counts)

# Four-digit prefixes whose group size lies within the allowed range (bounds included)
valid_postals_4 = postal_4_counts[
    postal_4_counts.between(MIN_BUCKET_SIZE, MAX_BUCKET_SIZE)
].index

# Label rows that are still unbucketed AND belong to a valid four-digit group;
# every other row keeps its current bucket value
fill_mask = df["bucket"].isna() & df["postal_4"].isin(valid_postals_4)
df.loc[fill_mask, "bucket"] = df.loc[fill_mask, "postal_4"]

In [173]:
# df["bucket"].value_counts()
# df[df.postal_3 == "513"]

In [174]:
# aaand 5 digits: a single full postal code that holds more unbucketed rows
# than one bucket allows is split into several near-equal buckets.

# Step 1: Filter unbucketed rows
unbucketed = df[df['bucket'].isna()]

# Step 2: Count 5-digit codes with count > MAX_BUCKET_SIZE
large_postal_5 = unbucketed['postal_clean'].value_counts()
splittable_postals = large_postal_5[large_postal_5 > MAX_BUCKET_SIZE]

# Step 3: Loop through each splittable group
for postal_code, count in splittable_postals.items():
    # Step 3a: every k whose average chunk size (count / k) is a valid bucket size
    valid_ks = [
        k for k in range(1, count + 1)
        if MIN_BUCKET_SIZE <= count / k <= MAX_BUCKET_SIZE
    ]

    if not valid_ks:
        continue  # skip if no valid split possible

    # Step 3b: Choose the k with the most even split. np.array_split produces
    # chunks of size count // k, with count % k of them one element longer, so
    # the size spread is 0 when k divides count and 1 otherwise. There is no
    # need to materialize the splits just to measure it. min() returns the
    # first minimum, so ties resolve to the smallest k.
    best_k = min(valid_ks, key=lambda k: count % k != 0)

    # Step 3c: Assign bucket names to each split. Each iteration only touches
    # rows of its own postal code, so the Step 1 snapshot is still accurate
    # for this code and the full frame does not have to be rescanned.
    same_code = (unbucketed['postal_clean'] == postal_code).to_numpy()
    indices = unbucketed.index[same_code]
    for i, chunk_indices in enumerate(np.array_split(indices.to_numpy(), best_k), start=1):
        df.loc[chunk_indices, 'bucket'] = f"{postal_code}_{i}"


In [176]:
# df[df['bucket'].isna()].count()
# There are 16260 komisje (commissions) left

In [177]:
# df[df['bucket'].notna()].count()
# we assigned 15367

In [178]:
# Independent snapshot of the rows that have no bucket at this point
leftovers = df.loc[df["bucket"].isna()].copy()
# Running number appended to merged-bucket labels; advanced by assign_buckets_by_prefix
new_bucket_counter = 1

def assign_buckets_by_prefix(df, prefix_col, label_prefix, min_size=None, max_size=None):
    """Bucket still-unassigned rows by cutting each prefix group into valid chunks.

    Rows whose ``bucket`` is missing are grouped by ``prefix_col``. Each group is
    sorted by ``postal_clean`` and cut into the smallest number ``k`` of
    near-equal consecutive chunks such that every chunk holds between
    ``min_size`` and ``max_size`` rows (both inclusive). Groups for which no
    such ``k`` exists, including groups smaller than ``min_size``, are left
    untouched.

    Args:
        df: DataFrame with ``bucket``, ``postal_clean`` and ``prefix_col``
            columns. It is modified in place.
        prefix_col: Name of the column to group by.
        label_prefix: Leading part of the generated bucket labels.
        min_size: Smallest allowed chunk size; defaults to ``MIN_BUCKET_SIZE``.
        max_size: Largest allowed chunk size; defaults to ``MAX_BUCKET_SIZE``.

    Returns:
        The same ``df`` object that was passed in.

    Side effects:
        Advances the module-level ``new_bucket_counter`` by one per bucket
        created; labels are ``f"{label_prefix}_{prefix}_{counter}"``.
    """
    global new_bucket_counter

    lower = MIN_BUCKET_SIZE if min_size is None else min_size
    upper = MAX_BUCKET_SIZE if max_size is None else max_size

    # The grouping is taken from a snapshot of the unbucketed rows, so labels
    # written inside the loop do not disturb the iteration.
    for prefix, group in df[df['bucket'].isna()].groupby(prefix_col):
        indices = group.sort_values('postal_clean').index.tolist()
        total = len(indices)

        # Only proceed if we can make at least one full-size bucket
        if total < lower:
            continue

        # Try increasing chunk counts; the first k that works gives the largest buckets
        for k in range(1, total + 1):
            if not lower <= total / k <= upper:
                continue

            chunks = np.array_split(indices, k)
            if all(lower <= len(chunk) <= upper for chunk in chunks):
                for chunk in chunks:
                    df.loc[chunk, 'bucket'] = f"{label_prefix}_{prefix}_{new_bucket_counter}"
                    new_bucket_counter += 1
                # Stop only after a split was actually applied; a rejected
                # split keeps the search going with the next k.
                break

    return df

# Run the helper on the four-digit prefix first, then on the three-digit prefix
# for whatever is still unbucketed after that pass
for prefix_column, bucket_label in (("postal_4", "merged_p4"), ("postal_3", "merged_p3")):
    df = assign_buckets_by_prefix(df, prefix_column, bucket_label)


In [None]:
# Per-column non-null counts among the rows that still have no bucket
df.loc[df["bucket"].isna()].count()

polling_station_id          1945
teryt_gmina                 1945
teryt_powiat                1945
postal_code                 1945
eligible_voters             1945
ballots_cast                1945
valid_votes                 1945
Nawrocki                    1945
Trzaskowski                 1945
postal_clean                1945
postal_clean_value_count    1945
postal_3                    1945
postal_3_value_count        1945
bucket                         0
postal_4                    1945
postal_4_value_count        1945
dtype: int64

In [184]:
# Two-digit prefix (XX) as the grouping key for the rows that are still unbucketed
df["postal_2"] = df["postal_clean"].astype(str).str[:2]
df = assign_buckets_by_prefix(df, prefix_col="postal_2", label_prefix="merged_p2")

In [187]:
# Per-column non-null counts among the rows that still have no bucket
df.loc[df["bucket"].isna()].count()

polling_station_id          361
teryt_gmina                 361
teryt_powiat                361
postal_code                 361
eligible_voters             361
ballots_cast                361
valid_votes                 361
Nawrocki                    361
Trzaskowski                 361
postal_clean                361
postal_clean_value_count    361
postal_3                    361
postal_3_value_count        361
bucket                        0
postal_4                    361
postal_4_value_count        361
postal_2                    361
dtype: int64

In [188]:
# Final pass: merge every remaining unbucketed row, ordered by postal code,
# into consecutive buckets regardless of prefix.
leftovers = df[df['bucket'].isna()].sort_values('postal_clean')
indices = leftovers.index.tolist()
total = len(indices)

# Try increasing chunk counts; the first k that works gives the largest buckets
for k in range(1, total + 1):
    if not MIN_BUCKET_SIZE <= total / k <= MAX_BUCKET_SIZE:
        continue

    chunks = np.array_split(indices, k)
    if all(MIN_BUCKET_SIZE <= len(chunk) <= MAX_BUCKET_SIZE for chunk in chunks):
        for i, chunk in enumerate(chunks, start=1):
            df.loc[chunk, 'bucket'] = f"merged_final_{i}"
        # Stop only after a split was actually applied
        break
else:
    # No k produced a valid split (for example fewer than MIN_BUCKET_SIZE rows
    # remain). Say so instead of silently leaving rows without a bucket; an
    # empty remainder needs no message.
    if total:
        print(
            f"⚠️ {total} rows could not be split into buckets of "
            f"{MIN_BUCKET_SIZE}-{MAX_BUCKET_SIZE} rows and remain unbucketed."
        )

In [192]:
# df[df.bucket.isna()].count()
# NO LEFTOVERS

In [193]:
# Number of distinct bucket labels (missing values are not counted)
df["bucket"].nunique(dropna=True)

2518