# Estimating Bucket Counts from (n, k)
This notebook continues prior work to derive a closed-form (logarithmic) regression formula estimating the column `buckets` as a function of `n` and `k` using the dataset `df_oddEven.csv`.

We follow a continuation-oriented outline to make this reproducible and resumable.

## 1. Load Previous Environment State
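If a prior serialized state file (`state_estimate_buckets.json`) exists, it is reloaded here and its keys are listed; otherwise the notebook reports that none was found and proceeds fresh. The loaded state is informational only: later sections recompute everything from the dataset.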


In [5]:
# Section 1: Load Previous Environment State (placeholder)
from __future__ import annotations
import json, os, math, datetime, sys
from pathlib import Path
# Checkpoint file; the same file name is written by the "Save state" cell (Section 10).
path_state = Path('state_estimate_buckets.json')
# Always a dict: stays empty when the file is missing, unreadable, or malformed.
state_loaded = {}
if path_state.exists():
    try:
        state_candidate = json.loads(path_state.read_text(encoding='utf-8'))
        # Only accept a JSON object; assigning a list or scalar here would leave
        # state_loaded in a shape the rest of the notebook does not expect.
        if not isinstance(state_candidate, dict):
            raise ValueError(f'expected a JSON object, got {type(state_candidate).__name__}')
        state_loaded = state_candidate
        print('Loaded prior state keys:', list(state_loaded.keys()))
    except (OSError, ValueError) as exception:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        # A broken checkpoint is reported but must not abort the notebook.
        print('Failed to load previous state file:', exception)
else:
    print('No prior state file present; proceeding fresh.')

No prior state file present; proceeding fresh.


## 2. Re-import Libraries and Configuration
We import the required standard-library modules and numerical tooling. By project convention, modules are referenced by their full names (no abbreviations such as `np`).

In [6]:
# Section 2: Imports and configuration
import csv
import statistics
import numpy  # project prefers full name, not abbreviation
from math import log, exp
from pathlib import Path
import itertools
import pprint

# Wider lines and fixed-point (non-scientific) notation for printed arrays.
numpy.set_printoptions(suppress=True, linewidth=130)
# Record the interpreter and numpy versions alongside the results.
for component_label, component_version in (('Python', sys.version), ('Numpy', numpy.__version__)):
    print(f'{component_label} version: {component_version}')


Python version: 3.13.7 (tags/v3.13.7:bcee1c3, Aug 14 2025, 14:15:11) [MSC v.1944 64 bit (AMD64)]
Numpy version: 2.3.2


## 3. Reload Cached / Intermediate Data
Load the `df_oddEven.csv` dataset containing (n, k, buckets).

In [7]:
# Section 3: Load data
path_dataset = Path('df_oddEven.csv')
# newline='' hands line-ending handling to the csv module, as the csv
# documentation requires for file objects passed to a reader.
with path_dataset.open(newline='') as handle:
    reader = csv.DictReader(handle)
    # Every cell of every column is parsed with int(); a non-integer cell raises ValueError.
    rows = [{key: int(value) for key, value in row.items()} for row in reader]
print(f'Loaded {len(rows)} rows from {path_dataset}')
print('Preview first 5 rows:')
for preview in rows[:5]:
    print(preview)

# Column-wise views of the dataset used by the later sections.
n_values = [r['n'] for r in rows]
k_values = [r['k'] for r in rows]
bucket_values = [r['buckets'] for r in rows]
assert len(n_values) == len(k_values) == len(bucket_values) > 0, 'Dataset length mismatch'

Loaded 38 rows from df_oddEven.csv
Preview first 5 rows:
{'n': 5, 'k': 3, 'buckets': 7}
{'n': 7, 'k': 3, 'buckets': 23}
{'n': 9, 'k': 5, 'buckets': 42}
{'n': 11, 'k': 5, 'buckets': 138}
{'n': 13, 'k': 7, 'buckets': 207}


## 4. Resume Parameter Definitions
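Define constants and feature selection choices for the regression. Note: the regression cell in Section 6 currently always fits the full model (all features); the switches below are recorded and printed but are not consulted by any later cell.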

In [8]:
# Section 4: Parameters
# Judging by their names, these flags are meant to toggle the log(n)*log(k)
# product term and the k/n, n/k ratio terms.
# NOTE(review): no visible cell reads them; the Section 6 design matrix always
# contains every feature. Confirm whether they should gate the columns.
use_feature_product, use_feature_ratios = True, True
print(f'Feature switches: {use_feature_product} {use_feature_ratios}')


Feature switches: True True


## 5. Reconstruct Random State for Reproducibility
Set seeds (not strictly required for deterministic linear algebra, but included for pattern consistency).

In [9]:
# Section 5: Random state
import random
random_seed = 1729  # Hardy-Ramanujan number for fun
# Apply the same seed to the stdlib generator and to numpy's legacy global generator.
for seed_generator in (random.seed, numpy.random.seed):
    seed_generator(random_seed)
print(f'Seeds set to {random_seed}')

Seeds set to 1729


## 6. Continue Data Processing Pipeline Step
Construct logarithmic and ratio-based features and perform least squares fit in log space.

In [10]:
# Section 6: Feature construction and regression
# Float copies of the three dataset columns.
n_array, k_array, buckets_array = (
    numpy.array(column, dtype=float) for column in (n_values, k_values, bucket_values)
)

# Logarithmic, interaction, and ratio features.
log_n, log_k = numpy.log(n_array), numpy.log(k_array)
product_log = log_n * log_k
ratio_k_over_n = k_array / n_array
ratio_n_over_k = n_array / k_array

# Design matrix of the full model: one column per entry of feature_names, in that order.
# NOTE(review): the Section 4 switches (use_feature_product / use_feature_ratios)
# are not consulted here; every feature is always included.
feature_names = ('bias', 'log_n', 'log_k', 'log_n_log_k', 'k_over_n', 'n_over_k')
X = numpy.column_stack((
    numpy.ones_like(n_array),
    log_n,
    log_k,
    product_log,
    ratio_k_over_n,
    ratio_n_over_k,
))

# Least-squares fit with log(buckets) as the target.
log_buckets = numpy.log(buckets_array)
coefficients, residuals, rank, singular_values = numpy.linalg.lstsq(X, log_buckets, rcond=None)
coefficients_named = {name: float(weight) for name, weight in zip(feature_names, coefficients)}
print('Coefficients:')
for key, value in coefficients_named.items():
    print(f'  {key:12s} = {value:+.12f}')

# In-sample fit quality: R^2 is measured in log space, MAPE in linear space.
log_buckets_pred = X @ coefficients
buckets_pred = numpy.exp(log_buckets_pred)
log_space_errors = log_buckets - log_buckets_pred
log_space_deviations = log_buckets - log_buckets.mean()
ss_res = numpy.sum(log_space_errors ** 2)
ss_tot = numpy.sum(log_space_deviations ** 2)
coefficient_determination = 1 - ss_res / ss_tot
absolute_relative_errors = numpy.abs((buckets_pred - buckets_array) / buckets_array)
mean_absolute_percentage_error = float(numpy.mean(absolute_relative_errors) * 100)
print(f'R^2 (log space): {coefficient_determination:.6f}')
print(f'MAPE %: {mean_absolute_percentage_error:.2f}')

# Final pure-Python estimation function (math only) ----------------------------------
from math import log as math_log, exp as math_exp

def estimateBuckets(n: int, k: int) -> float:
    """Estimate the number of buckets for given integers n and k.

    Model form (log-space):
        log(estimate) = a
            + b * log(n)
            + c * log(k)
            + d * log(n)*log(k)
            + e * (k / n)
            + f * (n / k)

    Coefficients were obtained via ordinary least squares fit on log(buckets)
    using the dataset in df_oddEven.csv (38 observations). The regression achieved
    R^2 ≈ 0.99052 (log space) with MAPE ≈ 39.08% on training data; individual
    in-sample errors ranged from about -66% to +98%.

    NOTE: This is an empirical approximation; extrapolation outside the range
    of n and k covered by df_oddEven.csv may be unreliable. The coefficients
    below are a frozen copy of that fit and do not update when the data changes.

    Parameters
    ----------
    n : int
        Primary size parameter (must be > 0)
    k : int
        Secondary size parameter (must be > 0)

    Returns
    -------
    float
        Estimated bucket count (positive real number). Caller may round if an
        integer is desired.

    Raises
    ------
    ValueError
        If n or k is not an ``int`` instance or is not strictly positive.
    OverflowError
        If the log-space estimate is too large for ``math.exp``; this can
        happen when extrapolating.
    """
    if not isinstance(n, int) or n <= 0:
        raise ValueError(f'allegedInt n must be positive int, got {n!r}')
    if not isinstance(k, int) or k <= 0:
        raise ValueError(f'allegedInt k must be positive int, got {k!r}')

    # Fitted coefficients, copied at 12 decimal places from the regression output.
    a = -679.088264366881
    b =  864.829109159972
    c = -873.871846814867
    d =    3.487829177620
    e =  943.512567960048
    f = -193.640628682536

    ln_n = math_log(n)
    ln_k = math_log(k)
    # The individual terms are large and of opposite sign, so the summation
    # order is kept fixed to keep results reproducible bit-for-bit.
    value_log = (a
                 + b * ln_n
                 + c * ln_k
                 + d * ln_n * ln_k
                 + e * (k / n)
                 + f * (n / k))
    return math_exp(value_log)

# Smoke test: run the frozen function on the first five dataset rows and show the signed percent error.
for sample in rows[:5]:
    smoke_n, smoke_k, smoke_actual = sample['n'], sample['k'], sample['buckets']
    predicted = estimateBuckets(smoke_n, smoke_k)
    smoke_error_percent = (predicted - smoke_actual) / smoke_actual * 100
    print(f"n={smoke_n:>2} k={smoke_k:>2} actual={smoke_actual:>10} predicted={predicted:>12.1f} err%={smoke_error_percent:6.2f}")


Coefficients:
  bias         = -679.088264366881
  log_n        = +864.829109159972
  log_k        = -873.871846814867
  log_n_log_k  = +3.487829177620
  k_over_n     = +943.512567960048
  n_over_k     = -193.640628682536
R^2 (log space): 0.990524
MAPE %: 39.08
n= 5 k= 3 actual=         7 predicted=         9.9 err%= 41.68
n= 7 k= 3 actual=        23 predicted=        41.9 err%= 82.27
n= 9 k= 5 actual=        42 predicted=        14.1 err%=-66.38
n=11 k= 5 actual=       138 predicted=        95.5 err%=-30.78
n=13 k= 7 actual=       207 predicted=        91.3 err%=-55.91


## 7. Append New Analysis Cells
Residual diagnostics and error summary.

In [11]:
# Section 7: Residual analysis
# Signed in-sample errors in linear (not log) space, as a percentage of the actual value.
residuals_linear = buckets_pred - buckets_array
percent_errors = (residuals_linear / buckets_array) * 100
print('Percent error summary (in-sample):')
for statistic_label, statistic_value in (
    ('min', percent_errors.min()),
    ('max', percent_errors.max()),
    ('mean', percent_errors.mean()),
    ('std', percent_errors.std()),
):
    # Label is padded to four characters so the numeric column lines up.
    print(f'  {statistic_label:<4s} {statistic_value:8.2f}')
# Row positions ordered by absolute percent error, largest first; keep five.
indices_sorted = numpy.argsort(numpy.abs(percent_errors))[::-1][:5]
print('\nTop 5 absolute percent errors:')
for worst_n, worst_k, worst_actual, worst_pred, worst_error in zip(
    n_array[indices_sorted],
    k_array[indices_sorted],
    buckets_array[indices_sorted],
    buckets_pred[indices_sorted],
    percent_errors[indices_sorted],
):
    print(f"n={worst_n:.0f} k={worst_k:.0f} actual={worst_actual:.0f} pred={worst_pred:.1f} err%={worst_error:.2f}")

Percent error summary (in-sample):
  min    -66.38
  max     98.19
  mean    10.47
  std     46.65

Top 5 absolute percent errors:
n=17 k=7 actual=1739 pred=3446.5 err%=98.19
n=37 k=19 actual=2052330 pred=3898808.5 err%=89.97
n=33 k=17 actual=445014 pred=839731.7 err%=88.70
n=7 k=3 actual=23 pred=41.9 err%=82.27
n=27 k=11 actual=140580 pred=256036.2 err%=82.13


## 8. Add Incremental Model Training Block
Not applicable (closed-form regression); section retained for structural consistency.

## 9. Extend Visualization Section
Generate a comparative textual visualization of actual versus predicted values (no plotting dependency is added).

In [12]:
# Section 9: Comparative textual output
print('Sample comparative rows (every ~5th):')
# Take rows 0, 5, 10, ... from each array and print actual vs. predicted side by side.
every_fifth = slice(0, None, 5)
for shown_n, shown_k, shown_actual, shown_pred in zip(
    n_array[every_fifth],
    k_array[every_fifth],
    buckets_array[every_fifth],
    buckets_pred[every_fifth],
):
    shown_error_percent = (shown_pred - shown_actual) / shown_actual * 100
    print(f"n={int(shown_n):>2} k={int(shown_k):>2} actual={int(shown_actual):>10} pred={shown_pred:>12.1f} err%={shown_error_percent:6.2f}")

Sample comparative rows (every ~5th):
n= 5 k= 3 actual=         7 pred=         9.9 err%= 41.68
n=15 k= 7 actual=       723 pred=       481.8 err%=-33.36
n=21 k= 9 actual=     10072 pred=     15944.8 err%= 58.31
n=29 k=15 actual=     96475 pred=    165223.8 err%= 71.26
n=31 k=13 actual=    825471 pred=   1105433.2 err%= 33.92
n=35 k=15 actual=   4550074 pred=   4442952.2 err%= -2.35
n=39 k=17 actual=  24006862 pred=  16918036.9 err%=-29.53
n=43 k=19 actual= 122844418 pred=  61388593.2 err%=-50.03


## 10. Persist Updated Artifacts and Checkpoints
Save coefficient dictionary and basic metadata for resuming later.

In [17]:
# Section 10: Save state
state_to_save = {
    # Timezone-aware UTC time. The previous naive local time with a literal 'Z'
    # appended mislabelled local time as UTC. '+00:00' is rewritten to 'Z' so
    # the stored format is unchanged.
    'timestamp': datetime.datetime.now(datetime.timezone.utc).isoformat().replace('+00:00', 'Z'),
    'coefficients': coefficients_named,
    'r2_log_space': float(coefficient_determination),
    'mape_percent': float(mean_absolute_percentage_error),
    # Extent of the training data, recorded so later sessions can detect extrapolation.
    'n_min': min(n_values),
    'n_max': max(n_values),
    'k_min': min(k_values),
    'k_max': max(k_values),
}
# Same file name that Section 1 looks for when resuming.
Path('state_estimate_buckets.json').write_text(json.dumps(state_to_save, indent=2), encoding='utf-8')
print('State file written with keys:', list(state_to_save.keys()))

State file written with keys: ['timestamp', 'coefficients', 'r2_log_space', 'mape_percent', 'n_min', 'n_max', 'k_min', 'k_max']


## 11. Lightweight Regression Tests for Continuation Integrity
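Assert schema conformity (positive integer `n` and `k`) and basic sanity bounds on the fit metrics: log-space R² between 0.9 and 1.01, and a finite MAPE. No comparison against a previously saved state is performed here.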

In [14]:
# Section 11: Integrity tests
# Schema: both size columns must contain only strictly positive Python ints.
for column_label, column_values in (('n', n_values), ('k', k_values)):
    assert all(isinstance(entry, int) and entry > 0 for entry in column_values), f'{column_label} contains invalid entries'
# Fit quality: log-space R^2 must lie strictly between 0.9 and 1.01.
assert coefficient_determination > 0.9 and coefficient_determination < 1.01, 'R^2 out of plausible expected bounds'
# MAPE may be large because the targets are skewed; it only has to be finite.
assert math.isfinite(mean_absolute_percentage_error), 'MAPE not finite'
print('Basic integrity checks passed.')

Basic integrity checks passed.


## 12. Automate Rerun with a make-like Script Cell
Provide a driver utility to recompute coefficients and emit a standalone function template.

In [16]:
# Section 12: Driver utility

def regenerate_model_and_function(path_csv: str = 'df_oddEven.csv') -> dict:
    """Refit the log-space model from a CSV file and emit a standalone estimator.

    The CSV must have integer-valued columns ``n``, ``k`` and ``buckets``. The
    model fitted is the same six-term form used elsewhere in this notebook:
    log(buckets) against [1, log n, log k, log n * log k, k/n, n/k].

    Parameters
    ----------
    path_csv : str
        Path of the dataset to read.

    Returns
    -------
    dict
        ``'coefficients'``: list of six floats, in the term order given above.
        ``'function_template'``: Python source text of an ``estimateBuckets``
        function with those coefficients embedded.
        ``'rows'``: number of data rows read.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a cell cannot be parsed by ``int()``.
    KeyError
        If one of the columns ``n``, ``k``, ``buckets`` is missing.
    """
    # newline='' hands line-ending handling to the csv module, as the csv
    # documentation requires for file objects passed to a reader.
    with Path(path_csv).open(newline='') as handle:
        dataset_rows = [
            {key: int(value) for key, value in row.items()}
            for row in csv.DictReader(handle)
        ]
    array_n = numpy.array([r['n'] for r in dataset_rows], dtype=float)
    array_k = numpy.array([r['k'] for r in dataset_rows], dtype=float)
    array_b = numpy.array([r['buckets'] for r in dataset_rows], dtype=float)
    ln_n = numpy.log(array_n)
    ln_k = numpy.log(array_k)
    ln_b = numpy.log(array_b)
    matrix_X = numpy.column_stack([
        numpy.ones_like(array_n), ln_n, ln_k, ln_n * ln_k, array_k / array_n, array_n / array_k
    ])
    # Only the solution vector is needed; residuals, rank and singular values are discarded.
    coefficients_new, *_ = numpy.linalg.lstsq(matrix_X, ln_b, rcond=None)
    coefficient_values = [float(c) for c in coefficients_new]
    # One list entry per emitted source line. The generated function checks
    # positivity only; it does not check that n and k are ints.
    template = '\n'.join([
        'def estimateBuckets(n: int, k: int) -> float:',
        '    from math import log as _log, exp as _exp',
        f'    a,b,c,d,e,f = {tuple(coefficient_values)}',
        "    if n<=0 or k<=0: raise ValueError('n and k must be positive')",
        '    ln_n=_log(n); ln_k=_log(k)',
        '    val = (a + b*ln_n + c*ln_k + d*ln_n*ln_k + e*(k/n) + f*(n/k))',
        '    return _exp(val)',
    ])
    return {
        'coefficients': coefficient_values,
        'function_template': template,
        'rows': len(dataset_rows),
    }

# Refit from the default dataset, then show each part of the result under its heading.
result_regen = regenerate_model_and_function()
for section_heading, result_key in (
    ('Regenerated coefficients (sanity check):', 'coefficients'),
    ('\nFunction template snippet:\n', 'function_template'),
):
    print(section_heading)
    print(result_regen[result_key])

Regenerated coefficients (sanity check):
[-679.0882643668813, 864.829109159972, -873.871846814867, 3.487829177620142, 943.5125679600484, -193.64062868253626]

Function template snippet:

def estimateBuckets(n: int, k: int) -> float:
    from math import log as _log, exp as _exp
    a,b,c,d,e,f = (-679.0882643668813, 864.829109159972, -873.871846814867, 3.487829177620142, 943.5125679600484, -193.64062868253626)
    if n<=0 or k<=0: raise ValueError('n and k must be positive')
    ln_n=_log(n); ln_k=_log(k)
    val = (a + b*ln_n + c*ln_k + d*ln_n*ln_k + e*(k/n) + f*(n/k))
    return _exp(val)
