In [1]:
import numpy as np

In [2]:
def generate_data(n, noise=0.0, seed=None):
    """Draw a 1-D binary classification sample with optional label noise.

    Features are standard-normal draws; the clean label is 1 where the
    feature is positive and 0 otherwise. Each label is then flipped
    independently with probability ``noise``.

    Parameters
    ----------
    n : int
        Number of samples to draw.
    noise : float, optional
        Label-flip probability; must lie in [0, 1].
    seed : int or None, optional
        When not None, NumPy's *global* random state is reseeded with this
        value before drawing. This is a deliberate, caller-visible side
        effect: later unseeded ``np.random`` calls continue the same stream.

    Returns
    -------
    X : numpy.ndarray
        Array of ``n`` feature values.
    Y : numpy.ndarray
        Integer array of ``n`` labels in {0, 1}.

    Raises
    ------
    ValueError
        If ``noise`` is not a probability in [0, 1] (NaN is rejected too).
    """
    # Written as a chained comparison so that NaN (which fails every
    # comparison) is rejected along with out-of-range values.
    if not 0.0 <= noise <= 1.0:
        raise ValueError(f"noise must be a probability in [0, 1], got {noise!r}")

    if seed is not None:
        np.random.seed(seed)

    # Draw order (features first, then flip mask) is kept fixed so seeded
    # results stay reproducible.
    X = np.random.randn(n)
    Y = (X > 0).astype(int)

    # Flip labels with probability noise
    flips = np.random.rand(n) < noise
    Y[flips] = 1 - Y[flips]
    return X, Y

In [3]:
def classifier(x):
    """Threshold classifier: predict 1 where the input is positive, else 0.

    Parameters
    ----------
    x : array_like or scalar
        Feature value(s). The input is passed through ``np.asarray`` so that
        plain Python lists and scalars are accepted as well as ndarrays.

    Returns
    -------
    numpy.ndarray or numpy integer scalar
        Integer labels (0 or 1) with the same shape as ``x``.
    """
    # np.asarray is a no-op for ndarrays, so array callers see no change.
    return (np.asarray(x) > 0).astype(int)

In [5]:
# Experiment configuration.

# Label-flip probabilities to evaluate.
noise_levels = [
    0.0,
    0.1,
    0.2,
    0.4,
]

# Training-set sizes to evaluate at each noise level.
sample_sizes = [
    10,
    20,
    50,
    100,
    500,
    1000,
]

# Number of independent training samples drawn per (noise, size) pair.
repeats = 50

# Size of the large sample used to approximate the true error.
test_size = 100_000

# Filled in by the experiment loop: results[noise][n] -> dict of averaged errors.
results = dict()

In [6]:
# Compare the classifier's empirical (training-sample) error against its
# true error, for every noise level and training-set size.
for noise in noise_levels:
    results[noise] = {}

    # Approximate the true error once per noise level on a large sample.
    # Passing seed=123 makes generate_data reseed NumPy's global RNG, so the
    # unseeded training draws below continue that seeded stream; that side
    # effect is what makes the whole cell reproducible.
    X_test, Y_test = generate_data(test_size, noise=noise, seed=123)
    Y_test_pred = classifier(X_test)
    true_error_estimate = np.mean(Y_test_pred != Y_test)

    for n in sample_sizes:
        emp_errors = []

        for _rep in range(repeats):
            X_train, Y_train = generate_data(n, noise=noise)
            Y_train_pred = classifier(X_train)

            # Fraction of training points the fixed classifier gets wrong.
            emp_errors.append(np.mean(Y_train_pred != Y_train))

        results[noise][n] = {
            "avg_empirical_error": np.mean(emp_errors),
            # The true-error estimate does not depend on n or on the repeat,
            # so it is stored directly instead of averaging `repeats`
            # identical copies of it.
            "avg_true_error": true_error_estimate,
        }

In [7]:
# Report the averaged errors: one section per noise level, one row per
# training-set size.
for noise in noise_levels:
    print(f"\nNoise = {noise}")
    for n in sample_sizes:
        res = results[noise][n]
        # Build the row first, then emit it in a single print call.
        row = (
            f"n={n:4d} | empirical error={res['avg_empirical_error']:.3f} "
            f"| true error={res['avg_true_error']:.3f}"
        )
        print(row)


Noise = 0.0
n=  10 | empirical error=0.000 | true error=0.000
n=  20 | empirical error=0.000 | true error=0.000
n=  50 | empirical error=0.000 | true error=0.000
n= 100 | empirical error=0.000 | true error=0.000
n= 500 | empirical error=0.000 | true error=0.000
n=1000 | empirical error=0.000 | true error=0.000

Noise = 0.1
n=  10 | empirical error=0.118 | true error=0.099
n=  20 | empirical error=0.096 | true error=0.099
n=  50 | empirical error=0.100 | true error=0.099
n= 100 | empirical error=0.095 | true error=0.099
n= 500 | empirical error=0.100 | true error=0.099
n=1000 | empirical error=0.099 | true error=0.099

Noise = 0.2
n=  10 | empirical error=0.226 | true error=0.199
n=  20 | empirical error=0.192 | true error=0.199
n=  50 | empirical error=0.201 | true error=0.199
n= 100 | empirical error=0.192 | true error=0.199
n= 500 | empirical error=0.200 | true error=0.199
n=1000 | empirical error=0.200 | true error=0.199

Noise = 0.4
n=  10 | empirical error=0.418 | true error=0.40