This notebook reproduces the experiments presented in the main paper and detailed in Appendix B.

In [1]:
import torch as t
import pandas as pd
from train import train
from models import Transformer, AoT
from utils import generate_data, power_unif_law
from tqdm import tqdm

# `seed` is passed to t.manual_seed at the top of each experiment cell;
# `repetition` is the number of training runs averaged per configuration.
seed, repetition = 2222, 5

# Experiments

Execute the cells below to produce the data stored in the folder `Scaling laws`. Experiments 1 to 4 and 6 are done exclusively on AoT, while experiment 5 involves general Transformers.

**Experiment 1**: we test the scaling in the variable $H$ (or *para* in the notebook), which is the number of heads.\
Result: the accuracy scaling law is linear in $H$.

**Experiment 2**: we test the scaling of the variable $d_{h}$, which is the dimension of each head.\
Result: the accuracy scaling law is quadratic in $d_h$.

**Experiment 3**: we test the scaling of the variable $d$, which is the embedding dimension, while maintaining $d_h$ constant.\
Result: the accuracy scaling law is piecewise linear: linear in $d$ for $d\leq d_{h}$ and constant afterward.

**Experiment 4**: we test the scaling of $d$, when heads have the same dimension as the residual stream. (The data used in experiment 4 is the same as experiment 1.)\
Result: we find a cubic scaling, which is expected given the results of experiments 2 and 3.

**Experiment 5**: we test the scaling of a Transformer with one attention head, and an MLP with varying width.\
Result: we find that the accuracy per parameter of the AoT is smaller than that of the MLP-based Transformer. However, the MLP-based Transformer is harder to optimize, meaning that, for the same computation power, AoT can offer a greater accuracy with finite compute.

**Experiment 6**: we test the scaling in the setting of Corollary 1, with $d=2$.\
Result: we find that our solution in that case is not optimal, as the observed scaling is greater by a factor of 1.5.

In [5]:
""" Common hyper-parameters (exp 1-4). """
t.manual_seed(seed)  # seed torch before the distribution and the data are built

# --- Model ---
N = 50
nb_layers, nb_head = 1, 1
n_gram = context_window = 3  # the context window is set to the n-gram order

# --- Distribution ---
alphas = [1.] * 3
nb_tokens = [N, N, 1]
pi = power_unif_law(alphas, nb_tokens, N)

# --- Training ---
batch_size, num_batch = 2**8, 2**6
lr = 1e-1
lr_low = lr * 5e-2
epochs = 2**6
Data = generate_data(
    batch_size=batch_size,
    num_batch=num_batch,
    pi=pi,
    context_window=context_window,
)

device = "cpu"  # Metal is much slower

In [None]:
""" Experiment 1. Scaling laws on H and d=d_head. """
t.manual_seed(seed)

# Scaling parameters
min_para = 1
max_para = 31
para_step = 5
min_d = 3
max_d = 13
d_step = 1

# One CSV file is written per embedding dimension d.
for d in range(min_d, max_d+1, d_step):
    d_head = d  # this experiment ties the head dimension to the embedding dimension

    mean_accuracy = []
    para_list = []
    N_list = []
    d_list = []
    d_head_list = []

    # The tqdm bar, labelled with the current d, reports progress over H (= para).
    for para in tqdm(range(min_para, max_para+1, para_step), desc=f"d={d}"):
        accuracy = 0.

        for _ in range(repetition):
            model = AoT(d, N, nb_layers, para, d_head, nb_head, context_window, pi)

            # Named `history` so the builtin `dict` is not shadowed in the kernel.
            history = train(model, Data, epochs, lr=lr, next_token=True, lr_low=lr_low)
            # Mean over the window of (up to) 100 recorded accuracies preceding the
            # last entry. Dividing by the window length keeps the mean correct when
            # fewer than 101 accuracies were recorded.
            window = history['Acc'][-101:-1]
            accuracy += sum(window) / len(window)

        # Average over the independent repetitions.
        mean_accuracy.append(accuracy/repetition)
        N_list.append(N)
        d_list.append(d)
        d_head_list.append(d_head)
        para_list.append(para)

    results = {
        'acc': mean_accuracy,
        'para': para_list,
        'N': N_list,
        'd': d_list,
        'd_head': d_head_list,
    }

    # We save the results as a dataframe.
    data = pd.DataFrame(results)
    data.to_csv(f'Scaling laws/Data_exp_1_{d}.csv', index=False)

In [None]:
""" Experiment 2. Scaling laws on d_head, with d!=d_head and H (=para) fixed. """
t.manual_seed(seed)

# Model parameters.
d = 10
para = 20

# Scaling parameters: d_head sweeps 1..d while d and H (= para) stay fixed.
min_d_head = 1
max_d_head = d
d_head_step = 1

mean_accuracy = []
para_list = []
N_list = []
d_list = []
d_head_list = []
for d_head in tqdm(range(min_d_head, max_d_head+1, d_head_step)):
    accuracy = 0.

    for _ in range(repetition):
        model = AoT(d, N, nb_layers, para, d_head, nb_head, context_window, pi)

        # Named `history` so the builtin `dict` is not shadowed in the kernel.
        history = train(model, Data, epochs, lr=lr, next_token=True, lr_low=lr_low)
        # Mean over the window of (up to) 100 recorded accuracies preceding the
        # last entry. Dividing by the window length keeps the mean correct when
        # fewer than 101 accuracies were recorded.
        window = history['Acc'][-101:-1]
        accuracy += sum(window) / len(window)

    # Average over the independent repetitions.
    mean_accuracy.append(accuracy/repetition)
    N_list.append(N)
    d_list.append(d)
    d_head_list.append(d_head)
    para_list.append(para)

results = {
    'acc': mean_accuracy,
    'para': para_list,
    'N': N_list,
    'd': d_list,
    'd_head': d_head_list,
}

# We save the results as a dataframe.
data = pd.DataFrame(results)
data.to_csv('Scaling laws/Data_exp_2.csv', index=False)

In [None]:
""" Experiment 3. Scaling laws on  d, with d_head and H (=para) fixed. """
t.manual_seed(seed)

# Model parameters.
d_head = 10
para = 20

# Scaling parameters: d sweeps across d_head (5..15) while d_head and H stay fixed.
min_d = 5
max_d = 15
d_step = 1

mean_accuracy = []
para_list = []
N_list = []
d_list = []
d_head_list = []
for d in tqdm(range(min_d, max_d+1, d_step)):
    accuracy = 0.

    for _ in range(repetition):
        model = AoT(d, N, nb_layers, para, d_head, nb_head, context_window, pi)

        # Named `history` so the builtin `dict` is not shadowed in the kernel.
        history = train(model, Data, epochs, lr=lr, next_token=True, lr_low=lr_low)
        # Mean over the window of (up to) 100 recorded accuracies preceding the
        # last entry. Dividing by the window length keeps the mean correct when
        # fewer than 101 accuracies were recorded.
        window = history['Acc'][-101:-1]
        accuracy += sum(window) / len(window)

    # Average over the independent repetitions.
    mean_accuracy.append(accuracy/repetition)
    N_list.append(N)
    d_list.append(d)
    d_head_list.append(d_head)
    para_list.append(para)

results = {
    'acc': mean_accuracy,
    'para': para_list,
    'N': N_list,
    'd': d_list,
    'd_head': d_head_list,
}

# We save the results as a dataframe.
data = pd.DataFrame(results)
data.to_csv('Scaling laws/Data_exp_3.csv', index=False)

In [None]:
""" Experiment 5. Scaling laws on the width of Transformer using MLPs. """
t.manual_seed(seed)

# NOTE: this cell rebinds the shared globals (N, pi, Data, batch_size, lr_low, ...)
# set by the common hyper-parameter cell. Re-run that cell before re-running
# experiments 1-3, otherwise they will silently use the values below.

# Model parameters.
N = 70
nb_layers = 1
nb_head = 1
n_gram = 3
context_window = n_gram

# Distribution parameters.
alphas = [1., 1., 1.]
nb_tokens=[N, N, 1]
pi = power_unif_law(alphas, nb_tokens, N)

# Training parameters.
batch_size=2**10
num_batch=2**8
lr=1e-1
epochs=2**6
Data = generate_data(batch_size=batch_size, num_batch=num_batch, pi=pi, context_window=context_window)

device="cpu" # Metal is much slower

# Sweep shared by both model families.
min_para = 1
max_para = 31
step_para = 5

# The two families differ only in how the model is built, in the lr_low factor
# and in the output file suffix. All "mlp" runs are done first, then all "att"
# runs, which is the same order as running two separate loops.
for suffix, lr_low_factor in [("mlp", 0.01), ("att", 5e-2)]:
    lr_low = lr*lr_low_factor

    for d in [7, 10, 13]:
        d_head = d

        mean_accuracy = []
        para_list = []
        N_list = []
        d_list = []
        d_head_list = []
        width_list = []
        for para in tqdm(range(min_para, max_para+1, step_para), desc=f"{suffix} d={d}"):
            # The width is recorded for both families, although only the "mlp"
            # family passes it to the model.
            width = 2*d*(para-1)
            accuracy = 0.

            for _ in range(repetition):
                if suffix == "mlp":
                    # Train regular Transformers: `width` and 1 are passed as the
                    # fourth and fifth arguments.
                    model = Transformer(d, N, nb_layers, width, 1, d_head, nb_head, context_window, pi)
                else:
                    # Train AoT: 0 and `para` are passed as the fourth and fifth arguments.
                    model = Transformer(d, N, nb_layers, 0, para, d_head, nb_head, context_window, pi)

                # Named `history` so the builtin `dict` is not shadowed in the kernel.
                history = train(model, Data, epochs, lr=lr, next_token=True, lr_low=lr_low)
                # Mean over the window of (up to) 100 recorded accuracies preceding
                # the last entry. Dividing by the window length keeps the mean
                # correct when fewer than 101 accuracies were recorded.
                window = history['Acc'][-101:-1]
                accuracy += sum(window) / len(window)

            # Average over the independent repetitions.
            mean_accuracy.append(accuracy/repetition)
            N_list.append(N)
            d_list.append(d)
            d_head_list.append(d_head)
            para_list.append(para)
            width_list.append(width)

        results = {
            'acc': mean_accuracy,
            'para': para_list,
            'N': N_list,
            'd': d_list,
            'd_head': d_head_list,
            'width': width_list,
        }

        # We save the results as a dataframe.
        data = pd.DataFrame(results)
        data.to_csv(f'Scaling laws/Data_exp_5_{d}_{suffix}.csv', index=False)

In [None]:
""" Experiment 6. Scaling laws on H with N=10, d=2, d_head=d. """
t.manual_seed(seed)
# Cell-local repetition count: the global `repetition` is left untouched, so an
# interrupted run cannot leave the other experiments averaging over 20 runs.
repetition_exp6 = 20

# NOTE: this cell rebinds the shared globals (N, pi, Data, ...) used by the
# other experiment cells of this section.

# Model parameters.
N = 10
nb_layers = 1
nb_head = 1
n_gram = 3
context_window = n_gram

# Distribution parameters.
alphas = [1., 1., 1.]
nb_tokens=[N, N, 1]
pi = power_unif_law(alphas, nb_tokens, N)

# Training parameters.
batch_size=2**8
num_batch=2**6
lr=1e-1
lr_low=lr*5e-2
epochs=2**6
Data = generate_data(batch_size=batch_size, num_batch=num_batch, pi=pi, context_window=context_window)


# Scaling parameters
min_para=1
max_para=21
para_step=4
d=2
d_head=d

mean_accuracy = []
para_list = []
N_list = []
d_list = []
d_head_list = []

for para in tqdm(range(min_para, max_para+1, para_step)):
    accuracy = 0.

    for _ in range(repetition_exp6):
        model = AoT(d, N, nb_layers, para, d_head, nb_head, context_window, pi)

        # Named `history` so the builtin `dict` is not shadowed in the kernel.
        history = train(model, Data, epochs, lr=lr, next_token=True, lr_low=lr_low)
        # Mean over the window of (up to) 100 recorded accuracies preceding the
        # last entry. Dividing by the window length keeps the mean correct when
        # fewer than 101 accuracies were recorded.
        window = history['Acc'][-101:-1]
        accuracy += sum(window) / len(window)

    # Average over the independent repetitions.
    mean_accuracy.append(accuracy/repetition_exp6)
    N_list.append(N)
    d_list.append(d)
    d_head_list.append(d_head)
    para_list.append(para)

results = {
    'acc': mean_accuracy,
    'para': para_list,
    'N': N_list,
    'd': d_list,
    'd_head': d_head_list,
}

# We save the results as a dataframe.
data = pd.DataFrame(results)
data.to_csv('Scaling laws/Data_exp_6.csv', index=False)

# Additional experiments: Large dimension

In this follow-up experiment, we reproduce experiments 1, 2 and 5 in dimension $d=50$ and with $N=200$.

In [3]:
""" Common hyper-parameters. """
t.manual_seed(seed)  # seed torch before the distribution and the data are built

# --- Model ---
N = 200
nb_layers, nb_head = 1, 1
n_gram = context_window = 3  # the context window is set to the n-gram order

# --- Distribution ---
alphas = [1.] * 3
nb_tokens = [N, N, 1]
pi = power_unif_law(alphas, nb_tokens, N)

# --- Training ---
batch_size, num_batch = 2**10, 2**6
lr = 1e-1
lr_low = lr * 5e-2
epochs = 2**6
Data = generate_data(
    batch_size=batch_size,
    num_batch=num_batch,
    pi=pi,
    context_window=context_window,
)

In [None]:
""" Experiment 1. Scaling laws on H and d=d_head. """
t.manual_seed(seed)

# Scaling parameters: values of H (= para) swept for every d.
para_list=[1, 3, 5, 7, 9, 11]

# One CSV file is written per embedding dimension d.
for d in [40, 50, 60]:
    d_head = d  # this experiment ties the head dimension to the embedding dimension

    mean_accuracy = []
    N_list = []
    d_list = []
    d_head_list = []

    for para in tqdm(para_list, desc=f"d={d}"):
        accuracy = 0.

        for _ in range(repetition):
            model = AoT(d, N, nb_layers, para, d_head, nb_head, context_window, pi)

            # Named `history` so the builtin `dict` is not shadowed in the kernel.
            history = train(model, Data, epochs, lr=lr, next_token=True, lr_low=lr_low)
            # Mean over the window of (up to) 100 recorded accuracies preceding the
            # last entry. Dividing by the window length keeps the mean correct when
            # fewer than 101 accuracies were recorded.
            window = history['Acc'][-101:-1]
            accuracy += sum(window) / len(window)

        # Average over the independent repetitions.
        mean_accuracy.append(accuracy/repetition)
        N_list.append(N)
        d_list.append(d)
        d_head_list.append(d_head)

    results = {
        'acc': mean_accuracy,
        'para': para_list,
        'N': N_list,
        'd': d_list,
        'd_head': d_head_list,
    }

    # We save the results as a dataframe.
    data = pd.DataFrame(results)
    data.to_csv(f'Scaling laws/Data_exp_1_{d}_dim.csv', index=False)

In [None]:
""" Experiment 2. Scaling laws on d_head, with d!=d_head and H (=para) fixed. """
t.manual_seed(seed)

# Scaling parameters: d_head is swept while d and H (= para) stay fixed.
d = 50
para = 11
d_head_list = [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

mean_accuracy = []
para_list = []
N_list = []
d_list = []
for d_head in tqdm(d_head_list):
    accuracy = 0.

    for _ in range(repetition):
        model = AoT(d, N, nb_layers, para, d_head, nb_head, context_window, pi)

        # Named `history` so the builtin `dict` is not shadowed in the kernel.
        history = train(model, Data, epochs, lr=lr, next_token=True, lr_low=lr_low)
        # Mean over the window of (up to) 100 recorded accuracies preceding the
        # last entry. Dividing by the window length keeps the mean correct when
        # fewer than 101 accuracies were recorded.
        window = history['Acc'][-101:-1]
        accuracy += sum(window) / len(window)

    # Average over the independent repetitions.
    mean_accuracy.append(accuracy/repetition)
    N_list.append(N)
    d_list.append(d)
    para_list.append(para)

results = {
    'acc': mean_accuracy,
    'para': para_list,
    'N': N_list,
    'd': d_list,
    'd_head': d_head_list,
}

# We save the results as a dataframe.
data = pd.DataFrame(results)
data.to_csv('Scaling laws/Data_exp_2_dim.csv', index=False)

In [None]:
""" Experiment 5. Scaling laws on the width of Transformer using MLPs. """
t.manual_seed(seed)
# Cell-local lr_low for the MLP runs. The global `lr_low` is shared with the
# experiment 1 and 2 cells of this section, so it is deliberately not overwritten.
mlp_lr_low = lr*0.01

for d in [40, 50, 60]:
    para = 1
    d_head = d
    # Widths sweep 2*d*(H-1) for H = 1, 3, ..., 11.
    min_width = 2*d*(1-1)
    max_width = 2*d*(11-1)
    step = 2*d*2

    mean_accuracy = []
    para_list = []
    N_list = []
    d_list = []
    d_head_list = []
    width_list = []
    for width in tqdm(range(min_width, max_width+1, step), desc=f"d={d}"):
        accuracy = 0.

        for _ in range(repetition):
            # `para` (= 1) is passed explicitly so the recorded 'para' column and
            # the model argument cannot drift apart.
            model = Transformer(d, N, nb_layers, width, para, d_head, nb_head, context_window, pi)

            # Named `history` so the builtin `dict` is not shadowed in the kernel.
            history = train(model, Data, epochs, lr=lr, next_token=True, lr_low=mlp_lr_low)
            # Mean over the window of (up to) 100 recorded accuracies preceding the
            # last entry. Dividing by the window length keeps the mean correct when
            # fewer than 101 accuracies were recorded.
            window = history['Acc'][-101:-1]
            accuracy += sum(window) / len(window)

        # Average over the independent repetitions.
        mean_accuracy.append(accuracy/repetition)
        N_list.append(N)
        d_list.append(d)
        d_head_list.append(d_head)
        para_list.append(para)
        width_list.append(width)

    results = {
        'acc': mean_accuracy,
        'para': para_list,
        'N': N_list,
        'd': d_list,
        'd_head': d_head_list,
        'width': width_list,
    }

    # We save the results as a dataframe.
    data = pd.DataFrame(results)
    data.to_csv(f'Scaling laws/Data_exp_5_{d}_dim.csv', index=False)

# Additional experiments: Large depth

In this follow-up experiment, we reproduce experiments 1, 2 and 5 in dimension $d=10$ and with 3 layers.

In [2]:
""" Common hyper-parameters. """
t.manual_seed(seed)  # seed torch before the distribution and the data are built

# --- Model ---
N = 70
nb_layers = 3  # Depth of the network
nb_head = 1
n_gram = context_window = 3  # the context window is set to the n-gram order

# --- Distribution ---
alphas = [1.] * 3
nb_tokens = [N, N, 1]
pi = power_unif_law(alphas, nb_tokens, N)

# --- Training ---
batch_size, num_batch = 2**9, 2**7
lr = 5e-2
lr_low = lr  # same value as lr in this section
epochs = 2**6
Data = generate_data(
    batch_size=batch_size,
    num_batch=num_batch,
    pi=pi,
    context_window=context_window,
)

In [None]:
""" Experiment 1. Scaling laws on H with fixed d=d_head. """
t.manual_seed(seed)

# Scaling parameters: values of H (= para) swept for every d.
para_list=[1, 5, 9, 13, 17, 21]

# One CSV file is written per embedding dimension d.
for d in [7, 10, 13]:
    d_head = d  # this experiment ties the head dimension to the embedding dimension
    mean_accuracy = []
    N_list = []
    d_list = []
    d_head_list = []

    for para in tqdm(para_list, desc=f"d={d}"):
        accuracy = 0.

        for _ in range(repetition):
            model = AoT(d, N, nb_layers, para, d_head, nb_head, context_window, pi)

            # Named `history` so the builtin `dict` is not shadowed in the kernel.
            history = train(model, Data, epochs, lr=lr, next_token=True, lr_low=lr_low)
            # Mean over the window of (up to) 100 recorded accuracies preceding the
            # last entry. Dividing by the window length keeps the mean correct when
            # fewer than 101 accuracies were recorded.
            window = history['Acc'][-101:-1]
            accuracy += sum(window) / len(window)

        # Average over the independent repetitions.
        mean_accuracy.append(accuracy/repetition)
        N_list.append(N)
        d_list.append(d)
        d_head_list.append(d_head)

    results = {
        'acc': mean_accuracy,
        'para': para_list,
        'N': N_list,
        'd': d_list,
        'd_head': d_head_list,
    }

    # We save the results as a dataframe.
    data = pd.DataFrame(results)
    data.to_csv(f'Scaling laws/Data_exp_1_{d}_depth.csv', index=False)

In [None]:
""" Experiment 2. Scaling laws on d_head, with d!=d_head and H (=para) fixed. """
t.manual_seed(seed)

# Model parameters.
d = 10
para = 21
# d_head sweeps 1..d; the bound is derived from d instead of repeating the literal 10.
d_head_list = list(range(1, d+1))

# Scaling parameters
mean_accuracy = []
para_list = []
N_list = []
d_list = []
for d_head in tqdm(d_head_list):
    accuracy = 0.

    for _ in range(repetition):
        model = AoT(d, N, nb_layers, para, d_head, nb_head, context_window, pi)

        # Named `history` so the builtin `dict` is not shadowed in the kernel.
        history = train(model, Data, epochs, lr=lr, next_token=True, lr_low=lr_low)
        # Mean over the window of (up to) 100 recorded accuracies preceding the
        # last entry. Dividing by the window length keeps the mean correct when
        # fewer than 101 accuracies were recorded.
        window = history['Acc'][-101:-1]
        accuracy += sum(window) / len(window)

    # Average over the independent repetitions.
    mean_accuracy.append(accuracy/repetition)
    N_list.append(N)
    d_list.append(d)
    para_list.append(para)

results = {
    'acc': mean_accuracy,
    'para': para_list,
    'N': N_list,
    'd': d_list,
    'd_head': d_head_list,
}

# We save the results as a dataframe.
data = pd.DataFrame(results)
data.to_csv('Scaling laws/Data_exp_2_depth.csv', index=False)

In [None]:
""" Experiment 5. Scaling laws on the width of Transformer using MLPs. """
t.manual_seed(seed)

for d in [7, 10, 13]:
    para = 1
    d_head = d
    # Widths sweep 2*d*(H-1) for H = 1, 5, ..., 21.
    min_width = 2*d*(1-1)
    max_width = 2*d*(21-1)
    step = 2*d*4

    mean_accuracy = []
    para_list = []
    N_list = []
    d_list = []
    d_head_list = []
    width_list = []
    for width in tqdm(range(min_width, max_width+1, step), desc=f"d={d}"):
        accuracy = 0.

        for _ in range(repetition):
            model = Transformer(d, N, nb_layers, width, para, d_head, nb_head, context_window, pi)

            # Named `history` so the builtin `dict` is not shadowed in the kernel.
            history = train(model, Data, epochs, lr=lr, next_token=True, lr_low=lr_low)
            # Mean over the window of (up to) 100 recorded accuracies preceding the
            # last entry. Dividing by the window length keeps the mean correct when
            # fewer than 101 accuracies were recorded.
            window = history['Acc'][-101:-1]
            accuracy += sum(window) / len(window)

        # Average over the independent repetitions.
        mean_accuracy.append(accuracy/repetition)
        N_list.append(N)
        d_list.append(d)
        d_head_list.append(d_head)
        para_list.append(para)
        width_list.append(width)

    results = {
        'acc': mean_accuracy,
        'para': para_list,
        'N': N_list,
        'd': d_list,
        'd_head': d_head_list,
        'width': width_list,
    }

    # We save the results as a dataframe.
    data = pd.DataFrame(results)
    data.to_csv(f'Scaling laws/Data_exp_5_{d}_depth.csv', index=False)