In [88]:
import pandas as pd
import numpy as np
from src.utilities import get_data_path

# Use the same file name as earlier
file_name = "2025_presidential_round2.csv"
cleaned_path = get_data_path("processed", "poland", f"{file_name[:-4]}_clean.csv")

# Load the cleaned DataFrame
df = pd.read_csv(cleaned_path, sep=";", encoding="utf-8")

print("✅ Cleaned DataFrame loaded.")

✅ Cleaned DataFrame loaded.


In [89]:
pd.set_option('display.max_columns', 100)  # or specify a number instead of None, e.g., 100
# pd.set_option('display.max_rows', 100)          # Control max rows shown
pd.set_option('display.max_rows', None)

In [90]:
# Remove empty postal codes
df = df[df["postal_code"].notna()]

# and possible zagranica:
df = df[df["teryt_gmina"].notna()]

In [91]:
# let's define bucket sizes
MIN_BUCKET_SIZE = 10
MAX_BUCKET_SIZE = 16

In [92]:
# Clean postal codes: remove dash
df['postal_clean'] = df['postal_code'].str.replace('-', '')
df['postal_clean_value_count'] = df['postal_clean'].map(df['postal_clean'].value_counts())

df['postal_2'] = df['postal_clean'].astype(str).str[:2]
df['postal_3'] = df['postal_clean'].astype(str).str[:3]
df['postal_3_value_count'] = df['postal_3'].map(df['postal_3'].value_counts())
df['postal_4'] = df['postal_clean'].astype(str).str[:4]
df['postal_4_value_count'] = df['postal_4'].map(df['postal_4'].value_counts())

In [93]:
# Step 1: Get value counts
postal_3_counts = df['postal_3'].value_counts()

# Step 2: Find postal_3 values with count between 10 and 16
valid_postals = postal_3_counts[(postal_3_counts >= MIN_BUCKET_SIZE) & (postal_3_counts <= MAX_BUCKET_SIZE)].index

# Step 3: Assign 'bucket' column based on valid_postals
df['bucket'] = df['postal_3'].where(df['postal_3'].isin(valid_postals))

In [94]:
postal_4_counts = df['postal_4'].value_counts()
valid_postals_4 = postal_4_counts[
    (postal_4_counts >= MIN_BUCKET_SIZE ) & (postal_4_counts <= MAX_BUCKET_SIZE )
].index

df.loc[df['bucket'].isna(), 'bucket'] = df.loc[
    df['postal_4'].isin(valid_postals_4), 'postal_4'
]

In [95]:
# df.head()
# df[df.bucket.notna()]
# df[df.bucket=="5973"]

In [97]:
# Step 5: Bucket remaining by postal_clean (postal_5) if count >= 10
postal_clean_counts = df['postal_clean'].value_counts()
valid_postal_5 = postal_clean_counts[postal_clean_counts >= MIN_BUCKET_SIZE].index

mask = df['bucket'].isna() & df['postal_clean'].isin(valid_postal_5)
df.loc[mask, 'bucket'] = df.loc[mask, 'postal_clean']

In [98]:
# df.bucket.head(100)
# df[df.bucket == "58260"]

# df[df.postal_4 == "5826"]

In [99]:
df[df.bucket.isna()].count()

polling_station_id          11463
teryt_gmina                 11463
teryt_powiat                11463
postal_code                 11463
eligible_voters             11463
ballots_cast                11463
valid_votes                 11463
Nawrocki                    11463
Trzaskowski                 11463
postal_clean                11463
postal_clean_value_count    11463
postal_2                    11463
postal_3                    11463
postal_3_value_count        11463
postal_4                    11463
postal_4_value_count        11463
bucket                          0
dtype: int64

In [100]:
# Reset bucket_counter if needed
bucket_counter = 1

# Helper function to merge leftovers within prefix
def merge_leftover_chunks(df, prefix_col, label_prefix, start_counter):
    global_counter = start_counter
    for prefix, group in df[df['bucket'].isna()].groupby(prefix_col):
        group = group.sort_values('postal_clean')
        indices = group.index.tolist()
        total = len(indices)
        i = 0

        while i < total:
            for size in range(MAX_BUCKET_SIZE, MIN_BUCKET_SIZE - 1, -1):
                if i + size <= total:
                    chunk = indices[i:i+size]
                    df.loc[chunk, 'bucket'] = f"{label_prefix}_{prefix}_{global_counter}"
                    global_counter += 1
                    i += size
                    break
            else:
                break  # leave small remainder for next level
    return df, global_counter

# Step 1: Try merging leftovers with postal_4
df, bucket_counter = merge_leftover_chunks(df, 'postal_4', 'merged_p4', bucket_counter)

# Step 2: Try merging remaining leftovers with postal_3
df, bucket_counter = merge_leftover_chunks(df, 'postal_3', 'merged_p3', bucket_counter)

# Step 3: Fallback to postal_2 for remaining leftovers
df, bucket_counter = merge_leftover_chunks(df, 'postal_2', 'merged_p2', bucket_counter)


In [101]:
df[df.bucket.isna()].count()

polling_station_id          288
teryt_gmina                 288
teryt_powiat                288
postal_code                 288
eligible_voters             288
ballots_cast                288
valid_votes                 288
Nawrocki                    288
Trzaskowski                 288
postal_clean                288
postal_clean_value_count    288
postal_2                    288
postal_3                    288
postal_3_value_count        288
postal_4                    288
postal_4_value_count        288
bucket                        0
dtype: int64

In [102]:
# 290 leftovers:
leftovers = df[df['bucket'].isna()].sort_values('postal_clean')
indices = leftovers.index.tolist()
total = len(indices)

for k in range(1, total + 1):
    avg = total / k
    if MIN_BUCKET_SIZE <= avg <= MAX_BUCKET_SIZE:
        chunks = np.array_split(indices, k)
        if all(MIN_BUCKET_SIZE <= len(c) <= MAX_BUCKET_SIZE for c in chunks):
            for i, chunk in enumerate(chunks, start=1):
                df.loc[chunk, 'bucket'] = f"merged_final_{i}"
        break

In [103]:
df.bucket.value_counts()

bucket
87100                 126
26600                 113
43300                  77
87800                  72
82300                  70
66400                  69
76200                  67
43100                  67
33100                  67
22400                  60
33300                  59
62800                  58
59300                  58
86300                  55
63400                  53
08110                  52
22100                  51
88100                  50
21400                  50
27400                  50
59220                  50
18400                  48
43600                  48
21500                  46
32020                  45
37700                  45
73110                  45
41500                  45
95100                  45
59700                  44
62200                  43
37450                  43
41800                  42
42400                  42
38200                  41
97300                  41
44100                  40
12100                  39
99300

In [104]:
df.bucket.nunique()

1957

In [105]:
# Get value counts per bucket
bucket_sizes = df['bucket'].value_counts()

# Count how many buckets fall into each range
buckets_10_16 = bucket_sizes[(bucket_sizes >= 10) & (bucket_sizes <= 16)].count()
buckets_6_30  = bucket_sizes[(bucket_sizes >= 6) & (bucket_sizes <= 30)].count()

# Optional: total number of buckets
total_buckets = bucket_sizes.count()

# Display results
print(f"Buckets with 10–16 items: {buckets_10_16} ({buckets_10_16 / total_buckets:.1%})")
print(f"Buckets with 6–30 items:  {buckets_6_30} ({buckets_6_30 / total_buckets:.1%})")


Buckets with 10–16 items: 1611 (82.3%)
Buckets with 6–30 items:  1868 (95.5%)
