In [1]:
from model_data_train import One_hidden_layer, generate_data_exp1, generate_data_exp2, train 
import torch as t
import numpy as np
import pandas as pd
from utils import nb_neurons, nb_epochs

seed = 2222

# Experiment 1 : Testing the limit of convergence

In this first experiment, we which to understand how many sub-Gaussian examples can be randomly drawn and learned by a 1-layer ReLU network in dimension $d$, and with $p=O(\log(d))$. Thus, we learn networks with an increasing number of examples in a fixed dimension and look at the probability that the network reaches 0.

**Data:** We generated $(x_i,y_i)_{1\leq i\leq n}$ *i.i.d* from the following distributions:
* $\frac{x_i}{||x_i||} \sim \mathcal{S}^d$ and $||x_i|| \sim \mathcal{U}([1,2])$,
* $\frac{y_i}{|y_i|} \sim \mathcal{U}(\{-1,+1\})$ and $|y_i| \sim \mathcal{U}([1,2])$.
With these distributions, the data follows the assumption from Lemma 4.

**Results:** See the analysis notebook and paper.

In [2]:
""" EXP1.0: Training networks with d=100, n in [2500, 3500], and p varying, with 80 repetition each. """
t.manual_seed(seed)

d=100
list_n = [n for n in range(1300, 2100+50, 50)]
repetition=80
eps=0.05
lr=1e0

Loss_trend = []
CV_Probability = []
for n in list_n:
    p = nb_neurons(n, eps)
    epoch = nb_epochs(n, p, lr) # This value corresponds to 1.5 times the phase transition threshold.
    loss = 0.
    proba = 0.
    interaction = 0.
    for _ in range(repetition):
        model = One_hidden_layer(emb_dim=d, hid_dim=p)
        data = generate_data_exp1(d, n)
        Loss = train(model, data, lr, epoch, verbose=False)
        print(n)

        if Loss[-1] <= p/(2*n):
            loss += 0.
            proba += 1/repetition
        else:
            loss += Loss[-1]/(repetition*Loss[0])
            proba += 0.


    Loss_trend.append(loss)
    CV_Probability.append(proba)

results = {
        'n': list_n,
        'd': [d]*len(list_n),
        'loss_trend': Loss_trend,
        'CV_probability': CV_Probability,
    }
data = pd.DataFrame(results)
data.to_csv(f'raw experiments/Data_exp_1_{d}_bis.csv', index=False)

1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1300
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1350
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400
1400


In [None]:
""" EXP1.1: Measuring the convergence threshold for d=10 to 100, with p=30 fixed and 20 repetitions each. """
t.manual_seed(seed)

eps=0.05
d_max = 100
n_max = 25*d_max
p=30
repetition=20
lr=1e0

for d in range(10, d_max+10, 10):
    mean_n = 15*d-200
    var_n = 15*d
    list_n = [n for n in range(max(mean_n-var_n, d), mean_n+var_n, d)]

    Loss_trend = []
    CV_Probability = []
    for n in list_n:
        epoch = nb_epochs(n, p, lr)
        loss = 0.
        proba = 0.
        interaction = 0.
        for _ in range(repetition):
            model = One_hidden_layer(emb_dim=d, hid_dim=p)
            data = generate_data_exp1(d, n)
            Loss = train(model, data, lr, epoch, verbose=False)

            initial_loss = p*t.sum(data[1]**2)/(2*n)

            if Loss[-1] <= p/(2*n):
                loss += 0.
                proba += 1/repetition
            else:
                loss += Loss[-1]/(repetition*initial_loss)
                proba += 0.

        Loss_trend.append(loss)
        CV_Probability.append(proba)

    results = {
        'n': list_n,
        'p': [p]*len(list_n),
        'd': [d]*len(list_n),
        'loss_trend': Loss_trend,
        'CV_probability': CV_Probability,
    }
    data = pd.DataFrame(results)
    data.to_csv(f'raw experiments/Data_exp_1_d_{d}.csv', index=False)

In [None]:
""" EXP1.2: Measuring the convergence threshold for p=350 to 400, with d=30 fixed and 20 repetitions each. """
t.manual_seed(seed)

d=30
repetition=20
lr=1e0 
for p in range(350, 400+10, 10):
    list_n = [n for n in range(300, 450+10, 10)]

    Loss_trend = []
    CV_Probability = []
    for n in list_n:
        epoch = nb_epochs(n, p, lr)
        loss = 0.
        proba = 0.
        interaction = 0.
        for _ in range(repetition):
            model = One_hidden_layer(emb_dim=d, hid_dim=p)
            data = generate_data_exp1(d, n)
            Loss = train(model, data, lr, epoch, verbose=False)

            initial_loss = p*t.sum(data[1]**2)/(2*n)

            if Loss[-1] <= p/(2*n):
                loss += 0.
                proba += 1/repetition
            else:
                loss += Loss[-1]/(repetition*initial_loss)
                proba += 0.

        Loss_trend.append(loss)
        CV_Probability.append(proba)

    results = {
        'n': list_n,
        'd': [d]*len(list_n),
        'p': [p]*len(list_n),
        'loss_trend': Loss_trend,
        'CV_probability': CV_Probability,
    }
    data = pd.DataFrame(results)
    data.to_csv(f'raw experiments/Data_exp_1_p_{p}.csv', index=False)

# Experiment 2 : Testing the convergence speed conjecture

In [None]:
""" EXP2: Measuring the convergence speed (local-PL curvature) in dimension d=2000, for n in [1000, 2000], and p varying. We perform 250 repetitions. """
t.manual_seed(seed)

d = 2000
min_n = 1000
max_n = 2000
step_n = 100
list_n = [n for n in range(min_n, max_n+step_n, step_n)]
repetition=250
epsilon=0.05
lr=1e0

log_Speed = []
log_avg_Speed = []
log_Lower = []
log_Upper = []
for n in list_n:
    p=nb_neurons(n, eps)
    epoch=nb_epochs(n, p, lr)
    
    speed = 0.
    avg_speed = 0.
    lower = 0.
    upper = 0.
    for _ in range(repetition):
        print(f"{n}: {_/repetition}")
        flag = True
        while flag: # To assure a well initialized network. 
            model = One_hidden_layer(emb_dim=d, hid_dim=p)
            data = generate_data_exp2(d, n)
            if model.is_well_init(data):
                Loss = train(model, data, lr, epoch, cv_threshold=1e-6, verbose=False)
                flag =  False

        window = 100
        mu = np.mean(np.log(np.array(Loss[-2-window:-2])/np.array(Loss[-1-window:-1])+1e-10)/(1*lr))
        speed += mu/repetition # Instantaneous speed at the last epoch.

        mu_inf = np.log(Loss[0]/Loss[-1])/(len(Loss)*lr)
        avg_speed += mu_inf/repetition # Average Speed at the end of the dynamic.

        indicator_matrix = (model.computations[0](data[0]) > 0).to(t.float)
        square_a_j = (model.computations[2].weight**2).squeeze()
        energies = indicator_matrix@square_a_j/p
        max_energy = energies.max()*16/(n)
        min_energy = energies.min()*2/(n)
        lower += min_energy.item()/repetition # The lower bound is measured at the last epoch.
        upper += max_energy.item()/repetition # The upper bound is measured at the last epoch.

    log_Speed.append(np.log(speed+1e-10))
    log_avg_Speed.append(np.log(avg_speed+1e-10))
    log_Lower.append(np.log(lower+1e-10))
    log_Upper.append(np.log(upper+1e-10))

results = {
        'n': list_n,
        'd': [d]*len(list_n),
        'log_speed': log_Speed,
        'log_avg_speed': log_avg_Speed,
        'log_lower': log_Lower,
        'log_upper': log_Upper,
    }
data = pd.DataFrame(results)
data.to_csv(f'raw experiments/Data_exp_2_log.csv', index=False)