In [1]:
from model_data_train import One_hidden_layer, generate_data_exp2, train 
from utils import do_list, do_train
import torch as t
import numpy as np
import pandas as pd
from utils import nb_neurons, nb_epochs

# Seed handed to t.manual_seed at the top of every experiment cell, so each cell
# restarts the torch RNG from the same state and can be re-run on its own.
# NOTE(review): only the torch RNG is seeded in this notebook; confirm the helpers
# imported above do not draw from numpy or the stdlib `random` module.
seed = 2222

# Experiment 1 : Testing the limit of convergence

In this first experiment, we wish to understand how many randomly drawn sub-Gaussian examples can be learned by a one-hidden-layer ReLU network in dimension $d$ with $p=O(\log(d))$ neurons. To this end, we train networks on an increasing number of examples in a fixed dimension and measure the probability that the training loss converges to 0.

**Data:** We generated $(x_i,y_i)_{1\leq i\leq n}$ *i.i.d.* from the following distributions:
* $\frac{x_i}{||x_i||} \sim \mathcal{S}^d$ and $||x_i|| \sim \mathcal{U}([1,2])$,
* $\frac{y_i}{|y_i|} \sim \mathcal{U}(\{-1,+1\})$ and $|y_i| \sim \mathcal{U}([1,2])$.

With these distributions, the data satisfies the assumption of Lemma 4 in the paper.

**Results:** See the analysis notebook and paper.

In [2]:
""" EXP1.0: Training networks with d=100, n in [1300, 2100], and p varying, with 80 repetition each. """
# Restart the torch RNG so this cell gives the same draws whenever it is re-run.
t.manual_seed(seed)

# Sweep settings: fixed dimension, learning rate, tolerance and repetition count.
d = 100
lr = 1e0
eps = 0.05
repetition = 80
list_n = do_list(min=1300, max=2100, step=50)

# One entry is appended to each of these lists per value of n.
list_p, Loss_trend, CV_Probability = [], [], []
for n in list_n:
    # The number of neurons is derived from the sample size and the tolerance.
    p = nb_neurons(n, eps)
    list_p += [p]

    # Train *repetition* networks for this setting and keep the averaged scores.
    loss, proba = do_train(d, n, p, lr, repetition)
    Loss_trend += [loss]
    CV_Probability += [proba]

# One row per n; d is repeated so that every column has the same length.
results = {
    'd': [d for _ in list_n],
    'n': list_n,
    'p': list_p,
    'loss_trend': Loss_trend,
    'CV_probability': CV_Probability,
}
data = pd.DataFrame(results)
data.to_csv(f'raw experiments/Data_exp_1_{d}_bis.csv', index=False)

In [None]:
""" EXP1.1: Measuring the convergence threshold for d=10 to 100, with p=30 fixed and 20 repetitions each. """
# Restart the torch RNG so this cell gives the same draws whenever it is re-run.
t.manual_seed(seed)

# Settings shared by every dimension of the sweep.
p = 30
lr = 1e0
eps = 0.05
repetition = 20
list_d = do_list(min=10, max=100, step=10)

for d in list_d:
    # The grid of sample sizes depends on d: step d, from d up to the bound 30*d-200.
    list_n = do_list(min=d, max=30*d-200, step=d)

    Loss_trend, CV_Probability = [], []
    for n in list_n:
        # Train *repetition* networks for this (d, n) pair and keep the averaged scores.
        loss, proba = do_train(d, n, p, lr, repetition)
        Loss_trend.append(loss)
        CV_Probability.append(proba)

    # One CSV file per dimension, one row per n; d and p are repeated to fill their columns.
    n_rows = len(list_n)
    results = {
        'd': n_rows * [d],
        'n': list_n,
        'p': n_rows * [p],
        'loss_trend': Loss_trend,
        'CV_probability': CV_Probability,
    }
    data = pd.DataFrame(results)
    data.to_csv(f'raw experiments/Data_exp_1_d_{d}.csv', index=False)

In [None]:
""" EXP1.2: Measuring the convergence threshold for p=350 to 400, with d=30 fixed and 20 repetitions each. """
# Restart the torch RNG so this cell gives the same draws whenever it is re-run.
t.manual_seed(seed)

# Fixed settings; only p (outer loop) and n (inner loop) vary.
d = 30
lr = 1e0
repetition = 20
list_p = do_list(min=350, max=400, step=10)
list_n = do_list(min=350, max=450, step=10)

for p in list_p:
    Loss_trend, CV_Probability = [], []
    for n in list_n:
        # Train *repetition* networks for this (p, n) pair and keep the averaged scores.
        loss, proba = do_train(d, n, p, lr, repetition)
        Loss_trend.append(loss)
        CV_Probability.append(proba)

    # One CSV file per width p, one row per n; d and p are repeated to fill their columns.
    n_rows = len(list_n)
    results = {
        'd': n_rows * [d],
        'n': list_n,
        'p': n_rows * [p],
        'loss_trend': Loss_trend,
        'CV_probability': CV_Probability,
    }
    data = pd.DataFrame(results)
    data.to_csv(f'raw experiments/Data_exp_1_p_{p}.csv', index=False)

# Experiment 2 : Testing the convergence speed conjecture

In this experiment, we want to provide empirical support for Conjecture 8: the asymptotic local-PL curvature scales as $\frac{1}{\sqrt{n}}$. We thus train networks on a varying number of examples ($n$ ranging from 1000 to 2000 orthogonal examples) to compute the rate at which the local-PL curvature $\mu(t)$ declines. We do so in 3 distinct ways:
* We measure the local-PL curvature at the end of the training: $\mu(t_{\infty})$ ($t_{\infty}$ being the last epoch),
* We measure the average-PL curvature until the end of the training: $\langle \mu_{\infty} \rangle$,
* We compute lower and upper bounds for the local-PL coefficient (the ones from Lemma 12) and evaluate them at the last epoch: $\mu_{\text{low}}$ and $\mu_{\text{upp}}$.

In [None]:
""" EXP2: Measuring the convergence speed (local-PL curvature) in dimension d=2000, for n in [1000, 2000], and p varying. We perform 250 repetitions. """
# Restart the torch RNG so this cell gives the same draws whenever it is re-run.
t.manual_seed(seed)

d=2000
list_n=do_list(min=1000, max=2000, step=100)
repetition=250
epsilon=0.05
lr=1e0
# Number of consecutive loss pairs averaged to estimate the speed at the end of training.
window = 100

# One entry per n: log of each quantity averaged over the repetitions.
log_Speed = []
log_avg_Speed = []
log_Lower = []
log_Upper = []
for n in list_n:
    p=nb_neurons(n, epsilon)
    # Train for 10x the epoch budget returned by nb_epochs.
    epoch=nb_epochs(n, p, lr)*10

    # Running means over the repetitions (each term is divided by `repetition` when added).
    speed = 0.
    avg_speed = 0.
    lower = 0.
    upper = 0.
    for rep_id in range(repetition):
        print(f"{n}: {rep_id/repetition}")

        # Redraw the network and the data until the pair passes is_well_init; only then train.
        while True:
            model = One_hidden_layer(emb_dim=d, hid_dim=p)
            # Scale the first-layer weights by 0.01 in place, without recording the
            # operation in autograd.
            with t.no_grad():
                model.computations[0].weight.mul_(0.01)
            dataset = generate_data_exp2(d, n)
            if model.is_well_init(dataset):
                break
        Loss = train(model, dataset, lr, epoch, cv_threshold=1e-6, verbose=False)

        # Instantaneous speed: mean of 2*(L_k - L_{k+1})/(L_k + L_{k+1}) over the last
        # `window` consecutive pairs that precede the final recorded loss.
        # Both sides of each pair come from one tail slice, so they always have the same
        # length, even when fewer than window+2 losses were recorded.
        # NOTE(review): with fewer than 3 recorded losses there is no pair and mu is nan.
        tail = np.array(Loss[-(window + 2):-1])
        prev_loss, next_loss = tail[:-1], tail[1:]
        mu = 2*np.mean((prev_loss - next_loss)/(next_loss + prev_loss + 1e-10))
        speed += mu/repetition # Instantaneous speed at the last epoch.

        mu_inf = np.log(Loss[0]/Loss[-1])/(len(Loss)*lr)
        avg_speed += mu_inf/repetition # Average Speed at the end of the dynamic.

        # Bounds on the local-PL coefficient, evaluated on the trained model.
        # These are diagnostics only, so no autograd graph is needed.
        with t.no_grad():
            # 1.0 where the first layer's output is positive on the inputs dataset[0], else 0.0.
            indicator_matrix = (model.computations[0](dataset[0]) > 0).to(t.float)
            # Squared weights of model.computations[2].
            square_a_j = (model.computations[2].weight**2).squeeze()
            energies = indicator_matrix@square_a_j/p
        max_energy = energies.max()*16/(n)
        min_energy = energies.min()*2/(n)
        lower += min_energy.item()/repetition # The lower bound is measured at the last epoch.
        upper += max_energy.item()/repetition # The upper bound is measured at the last epoch.

    # The 1e-10 offset keeps the logarithm finite when an average is exactly 0.
    log_Speed.append(np.log(speed+1e-10))
    log_avg_Speed.append(np.log(avg_speed+1e-10))
    log_Lower.append(np.log(lower+1e-10))
    log_Upper.append(np.log(upper+1e-10))

# One row per n.
results = {
        'n': list_n,
        'd': [d]*len(list_n),
        'log_speed': log_Speed,
        'log_avg_speed': log_avg_Speed,
        'log_lower': log_Lower,
        'log_upper': log_Upper,
    }
data = pd.DataFrame(results)
data.to_csv(f'raw experiments/Data_exp_2_bis.csv', index=False)