This notebook reproduces the experiments presented in the main paper and detailed in Appendix B.

In [1]:
import torch as t
t.set_num_threads(8)
import pandas as pd
from train import train
from models import Transformer, AoT
from utils import generate_data, power_unif_law
from tqdm import tqdm

# Global random seed; every experiment cell passes it to t.manual_seed before running.
seed=2222
# Number of independent training runs whose accuracies are averaged for each configuration.
repetition=2

# Experiments

Execute the cells below to produce the data stored in the folder `Scaling laws`. Experiments 1 to 4 are done exclusively on AoT, while experiment 5 involves general Transformers.

**Experiment 1**: we test the scaling in the variable $H$ (or *para* in the notebook), which is the number of heads.
Result: the scaling is found to be linear, as expected.

**Experiment 2**: we test the scaling of the variable $d_{h}$, which is the dimension of each head.
Result: the scaling is linear as expected.

**Experiment 3**: we test the scaling of the variable $d$, which is the embedding dimension.
Result: the scaling is piecewise linear, with the break at $d=d_{h}$. The second linear regime is noisy, which suggests that this might be an optimization issue.

**Experiment 4**: we test the scaling of $d=d_h$, when heads have the same dimension as the residual stream. (The data used in experiment 4 is the same as experiment 1.)
Result: we find a scaling which is cubic. The scaling was expected to be at least quadratic, using the results from experiments 2 and 3. The scaling being cubic might suggest that experiment 3 indeed had optimization issues, which were alleviated by taking $d=d_h$.

**Experiment 5**: we test the scaling of a Transformer with one attention head, and an MLP with varying width.
Result: we find that the accuracy, by parameter, of both the AoT and the MLP-based Transformer are equivalent.

**Experiment 6**: we test the scaling in the setting of Corollary 1, with $d=2$. We find that our solution in that case is not optimal, as the observed scaling is greater.

In [2]:
""" Common hyper-parameters (exp 1-5). """
# Re-seed torch before building the distribution and the dataset
# (presumably so that the generated data is reproducible — TODO confirm that
# power_unif_law / generate_data draw from torch's global RNG).
t.manual_seed(seed)

# Model parameters.
N = 50  # passed to the model constructors and to power_unif_law; presumably the vocabulary size — TODO confirm
nb_layers = 1
nb_head = 1
n_gram = 3
context_window = n_gram  # the context window is set equal to the n-gram order

# Distribution parameters.
alphas = [1., 1., 1.]
nb_tokens=[N, N, 1]
pi = power_unif_law(alphas, nb_tokens, N)

# Training parameters.
batch_size=2**10
num_batch=1000
lr=1e-3
epochs=10
# The dataset is generated once here and shared by every experiment cell that
# runs before these globals are redefined.
Data = generate_data(batch_size=batch_size, num_batch=num_batch, pi=pi, context_window=context_window)

In [None]:
""" Experiment 1. Scaling laws on H and d=d_head. """
t.manual_seed(seed)

# Scaling parameters
min_para=1
max_para=31
para_step=5
min_d=3
max_d=13
d_step=1

# One CSV file is written per embedding dimension d; the file index i is the
# position of d in the sweep (i=0 for d=min_d).
for i, d in enumerate(range(min_d, max_d+1, d_step)):
    print(f"Run d={d}")
    d_head = d  # heads share the embedding dimension in this experiment

    # One record (row of the output CSV) per number of heads H (=para).
    records = []

    for para in tqdm(range(min_para, max_para+1, para_step)):
        print(f"Run H={para}")
        accuracy = 0

        for rep in range(repetition):
            print(f"Run rep={rep}")
            model = AoT(d, N, nb_layers, para, d_head, nb_head, context_window, pi)

            # Bound to `history`, not `dict`: rebinding the builtin `dict` at
            # notebook level would break any later `dict(...)` call in the kernel.
            history = train(model, Data, epochs, lr=lr, next_token=True)
            # Average of the 100 'Acc' entries preceding the final one; the slice
            # [-101:-1] excludes the last entry. Kept unchanged so results stay
            # comparable with existing CSVs.
            # NOTE(review): confirm 'Acc' always holds more than 100 entries.
            accuracy += sum(history['Acc'][-101:-1])/100

        records.append({
            'acc': accuracy/repetition,  # mean over the repetitions
            'para': para,
            'N': N,
            'd': d,
            'd_head': d_head,
        })

    # We save the results as a dataframe.
    data = pd.DataFrame(records, columns=['acc', 'para', 'N', 'd', 'd_head'])
    data.to_csv(f'Scaling laws/Data_exp_1_{i}.csv', index=False)

Run d=3


  0%|          | 0/7 [00:00<?, ?it/s]

Run H=1
Run rep=0
Run rep=1


 14%|█▍        | 1/7 [01:03<06:22, 63.70s/it]

Run H=6
Run rep=0
Run rep=1


 29%|██▊       | 2/7 [03:20<08:52, 106.45s/it]

Run H=11
Run rep=0
Run rep=1


 43%|████▎     | 3/7 [08:25<13:09, 197.31s/it]

Run H=16
Run rep=0
Run rep=1


 57%|█████▋    | 4/7 [13:17<11:43, 234.53s/it]

Run H=21
Run rep=0


In [None]:
""" Experiment 2. Scaling laws on d_head, with d!=d_head and H (=para) fixed. """
t.manual_seed(seed)

# Model parameters.
d = 10
para = 20

# Scaling parameters
min_d_head = 1
max_d_head = d  # the sweep ends at d_head = d
d_head_step = 1

# One record (row of the output CSV) per head dimension.
records = []
for d_head in tqdm(range(min_d_head, max_d_head+1, d_head_step)):
    accuracy = 0

    for rep in range(repetition):
        model = AoT(d, N, nb_layers, para, d_head, nb_head, context_window, pi)

        # Bound to `history`, not `dict`: rebinding the builtin `dict` at
        # notebook level would break any later `dict(...)` call in the kernel.
        history = train(model, Data, epochs, lr=lr, next_token=True)
        # Average of the 100 'Acc' entries preceding the final one; the slice
        # [-101:-1] excludes the last entry. Kept unchanged so results stay
        # comparable with existing CSVs.
        # NOTE(review): confirm 'Acc' always holds more than 100 entries.
        accuracy += sum(history['Acc'][-101:-1])/100

    records.append({
        'acc': accuracy/repetition,  # mean over the repetitions
        'para': para,
        'N': N,
        'd': d,
        'd_head': d_head,
    })

# We save the results as a dataframe.
data = pd.DataFrame(records, columns=['acc', 'para', 'N', 'd', 'd_head'])
data.to_csv('Scaling laws/Data_exp_2.csv', index=False)

In [None]:
""" Experiment 3. Scaling laws on  d, with d_head and H (=para) fixed. """
t.manual_seed(seed)

# Model parameters.
d_head = 10
para = 20

# Scaling parameters
min_d = 5
max_d = 15
d_step = 1

# One record (row of the output CSV) per embedding dimension.
records = []
for d in tqdm(range(min_d, max_d+1, d_step)):
    accuracy = 0

    for rep in range(repetition):
        model = AoT(d, N, nb_layers, para, d_head, nb_head, context_window, pi)

        # Bound to `history`, not `dict`: rebinding the builtin `dict` at
        # notebook level would break any later `dict(...)` call in the kernel.
        history = train(model, Data, epochs, lr=lr, next_token=True)
        # Average of the 100 'Acc' entries preceding the final one; the slice
        # [-101:-1] excludes the last entry. Kept unchanged so results stay
        # comparable with existing CSVs.
        # NOTE(review): confirm 'Acc' always holds more than 100 entries.
        accuracy += sum(history['Acc'][-101:-1])/100

    records.append({
        'acc': accuracy/repetition,  # mean over the repetitions
        'para': para,
        'N': N,
        'd': d,
        'd_head': d_head,
    })

# We save the results as a dataframe.
data = pd.DataFrame(records, columns=['acc', 'para', 'N', 'd', 'd_head'])
data.to_csv('Scaling laws/Data_exp_3.csv', index=False)

In [None]:
""" Experiment 5. Scaling laws on the width of Transformer using MLPs. """
t.manual_seed(seed)

# Model parameters.
para = 1

# exp_num is the file index under which each d is saved; it equals d - 3, the
# index Experiment 1 gives to the same d in its sweep starting at d=3.
for d, exp_num in zip([5, 7, 10, 13], [2, 4, 7, 10]):
    d_head = d
    # Widths sweep over 0, 10*d, 20*d, ..., 50*d.
    min_width = 2*d*(1-1)
    max_width = 2*d*(26-1)
    step = 2*d*5

    # One record (row of the output CSV) per MLP width.
    records = []
    for width in tqdm(range(min_width, max_width+1, step)):
        accuracy = 0

        for rep in range(repetition):
            model = Transformer(d, N, nb_layers, width, para, d_head, nb_head, context_window, pi)

            # Bound to `history`, not `dict`: rebinding the builtin `dict` at
            # notebook level would break any later `dict(...)` call in the kernel.
            history = train(model, Data, epochs, lr=lr, next_token=True)
            # Average of the 100 'Acc' entries preceding the final one; the slice
            # [-101:-1] excludes the last entry. Kept unchanged so results stay
            # comparable with existing CSVs.
            # NOTE(review): confirm 'Acc' always holds more than 100 entries.
            accuracy += sum(history['Acc'][-101:-1])/100

        records.append({
            'acc': accuracy/repetition,  # mean over the repetitions
            'para': para,
            'N': N,
            'd': d,
            'd_head': d_head,
            'width': width,
        })

    # We save the results as a dataframe.
    data = pd.DataFrame(records, columns=['acc', 'para', 'N', 'd', 'd_head', 'width'])
    data.to_csv(f'Scaling laws/Data_exp_5_{exp_num}.csv', index=False)

In [None]:
""" Experiment 6. Scaling laws on H with N=10, d=2, d_head=5. """
t.manual_seed(seed)

# WARNING: this cell overwrites the shared globals N, pi, Data and epochs (among
# others). Re-run the common hyper-parameters cell before going back to
# experiments 1-5.

# Model parameters.
N = 10
nb_layers = 1
nb_head = 1
n_gram = 3
context_window = n_gram

# Distribution parameters.
alphas = [1., 1., 1.]
nb_tokens=[N, N, 1]
pi = power_unif_law(alphas, nb_tokens, N)

# Training parameters.
batch_size=2**10
num_batch=1000
lr=1e-3
epochs=15
Data = generate_data(batch_size=batch_size, num_batch=num_batch, pi=pi, context_window=context_window)

# Scaling parameters
min_para=1
max_para=21
para_step=4
d=2
d_head=5

# One record (row of the output CSV) per number of heads H (=para).
records = []

for para in tqdm(range(min_para, max_para+1, para_step)):
    accuracy = 0

    for rep in range(repetition):
        model = AoT(d, N, nb_layers, para, d_head, nb_head, context_window, pi)

        # Bound to `history`, not `dict`: rebinding the builtin `dict` at
        # notebook level would break any later `dict(...)` call in the kernel.
        history = train(model, Data, epochs, lr=lr, next_token=True)
        # Average of the 100 'Acc' entries preceding the final one; the slice
        # [-101:-1] excludes the last entry. Kept unchanged so results stay
        # comparable with existing CSVs.
        # NOTE(review): confirm 'Acc' always holds more than 100 entries.
        accuracy += sum(history['Acc'][-101:-1])/100

    records.append({
        'acc': accuracy/repetition,  # mean over the repetitions
        'para': para,
        'N': N,
        'd': d,
        'd_head': d_head,
    })

# We save the results as a dataframe.
data = pd.DataFrame(records, columns=['acc', 'para', 'N', 'd', 'd_head'])
data.to_csv('Scaling laws/Data_exp_6.csv', index=False)

# Additional experiments: Large dimension

In [None]:
""" Common hyper-parameters. """
# Re-seed torch before rebuilding the distribution and the dataset
# (presumably so that the generated data is reproducible — TODO confirm that
# power_unif_law / generate_data draw from torch's global RNG).
t.manual_seed(seed)

# Model parameters.
# NOTE: these assignments replace the globals of the same name set by earlier
# cells; the cells of this section rely on the values below.
N = 200
nb_layers = 1
nb_head = 1
n_gram = 3
context_window = n_gram  # the context window is set equal to the n-gram order

# Distribution parameters.
alphas = [1., 1., 1.]
nb_tokens=[N//2, N//2, 1]
pi = power_unif_law(alphas, nb_tokens, N)

# Training parameters.
batch_size=2**10
num_batch=1000
lr=1e-3
epochs=10
# The dataset is regenerated for the new distribution and shared by the cells
# of this section.
Data = generate_data(batch_size=batch_size, num_batch=num_batch, pi=pi, context_window=context_window)

In [None]:
""" Experiment 1. Scaling laws on H and d=d_head. """
t.manual_seed(seed)

# Scaling parameters
d = 50
d_head = d
para_list=[1, 3, 5, 7, 9, 11]
# File index of this run; 7 is also the index the large-dimension
# Experiment 5 cell uses for d=50.
exp_num = 7

# One record (row of the output CSV) per number of heads H (=para).
records = []

for para in tqdm(para_list):
    accuracy = 0.

    for rep in range(repetition):
        model = AoT(d, N, nb_layers, para, d_head, nb_head, context_window, pi)

        # Bound to `history`, not `dict`: rebinding the builtin `dict` at
        # notebook level would break any later `dict(...)` call in the kernel.
        history = train(model, Data, epochs, lr=lr, next_token=True)
        # Average of the 100 'Acc' entries preceding the final one; the slice
        # [-101:-1] excludes the last entry. Kept unchanged so results stay
        # comparable with existing CSVs.
        # NOTE(review): confirm 'Acc' always holds more than 100 entries.
        accuracy += sum(history['Acc'][-101:-1])/100

    records.append({
        'acc': accuracy/repetition,  # mean over the repetitions
        'para': para,
        'N': N,
        'd': d,
        'd_head': d_head,
    })

# We save the results as a dataframe.
data = pd.DataFrame(records, columns=['acc', 'para', 'N', 'd', 'd_head'])
data.to_csv(f'Scaling laws/Data_exp_1_{exp_num}_dim.csv', index=False)

In [None]:
""" Experiment 2. Scaling laws on d_head, with d!=d_head and H (=para) fixed. """
t.manual_seed(seed)

# Scaling parameters
d = 50
para = 8
d_head_list = [1, 10, 20, 30, 40, 50]  # the sweep ends at d_head = d

# One record (row of the output CSV) per head dimension.
records = []
for d_head in tqdm(d_head_list):
    accuracy = 0.

    for rep in range(repetition):
        model = AoT(d, N, nb_layers, para, d_head, nb_head, context_window, pi)

        # Bound to `history`, not `dict`: rebinding the builtin `dict` at
        # notebook level would break any later `dict(...)` call in the kernel.
        history = train(model, Data, epochs, lr=lr, next_token=True)
        # Average of the 100 'Acc' entries preceding the final one; the slice
        # [-101:-1] excludes the last entry. Kept unchanged so results stay
        # comparable with existing CSVs.
        # NOTE(review): confirm 'Acc' always holds more than 100 entries.
        accuracy += sum(history['Acc'][-101:-1])/100

    records.append({
        'acc': accuracy/repetition,  # mean over the repetitions
        'para': para,
        'N': N,
        'd': d,
        'd_head': d_head,
    })

# We save the results as a dataframe.
data = pd.DataFrame(records, columns=['acc', 'para', 'N', 'd', 'd_head'])
data.to_csv('Scaling laws/Data_exp_2_dim.csv', index=False)

In [None]:
""" Experiment 5. Scaling laws on the width of Transformer using MLPs. """
t.manual_seed(seed)

# Each embedding dimension d is saved under the file index exp_num.
for d, exp_num in zip([40, 50, 60], [4, 7, 10]):
    para = 1
    d_head = d
    # Widths sweep over 0, 4*d, 8*d, ..., 20*d.
    min_width = 2*d*(1-1)
    max_width = 2*d*(11-1)
    step = 2*d*2

    # One record (row of the output CSV) per MLP width.
    records = []
    for width in tqdm(range(min_width, max_width+1, step)):
        accuracy = 0.

        for rep in range(repetition):
            model = Transformer(d, N, nb_layers, width, para, d_head, nb_head, context_window, pi)

            # Bound to `history`, not `dict`: rebinding the builtin `dict` at
            # notebook level would break any later `dict(...)` call in the kernel.
            history = train(model, Data, epochs, lr=lr, next_token=True)
            # Average of the 100 'Acc' entries preceding the final one; the slice
            # [-101:-1] excludes the last entry. Kept unchanged so results stay
            # comparable with existing CSVs.
            # NOTE(review): confirm 'Acc' always holds more than 100 entries.
            accuracy += sum(history['Acc'][-101:-1])/100

        records.append({
            'acc': accuracy/repetition,  # mean over the repetitions
            'para': para,
            'N': N,
            'd': d,
            'd_head': d_head,
            'width': width,
        })

    # We save the results as a dataframe.
    data = pd.DataFrame(records, columns=['acc', 'para', 'N', 'd', 'd_head', 'width'])
    data.to_csv(f'Scaling laws/Data_exp_5_{exp_num}_dim.csv', index=False)


# AoT runs scaling the number of heads H (=para) with d=d_head, saved as
# Experiment-1 files under the file index exp_num.
for d, exp_num in zip([40, 60], [4, 10]):
    d_head = d
    min_para = 1
    max_para = 11
    step = 2

    # One record (row of the output CSV) per number of heads H (=para).
    records = []
    for para in tqdm(range(min_para, max_para+1, step)):
        accuracy = 0.

        for rep in range(repetition):
            model = AoT(d, N, nb_layers, para, d_head, nb_head, context_window, pi)

            # Bound to `history`, not `dict`: rebinding the builtin `dict` at
            # notebook level would break any later `dict(...)` call in the kernel.
            history = train(model, Data, epochs, lr=lr, next_token=True)
            # Average of the 100 'Acc' entries preceding the final one; the slice
            # [-101:-1] excludes the last entry. Kept unchanged so results stay
            # comparable with existing CSVs.
            # NOTE(review): confirm 'Acc' always holds more than 100 entries.
            accuracy += sum(history['Acc'][-101:-1])/100

        # No 'width' entry: the AoT constructor takes no width argument, and the
        # only `width` in scope here would be a stale value left over from a
        # previous loop. The columns match the other Data_exp_1_* files.
        records.append({
            'acc': accuracy/repetition,  # mean over the repetitions
            'para': para,
            'N': N,
            'd': d,
            'd_head': d_head,
        })

    # We save the results as a dataframe.
    data = pd.DataFrame(records, columns=['acc', 'para', 'N', 'd', 'd_head'])
    data.to_csv(f'Scaling laws/Data_exp_1_{exp_num}_dim.csv', index=False)

# Additional experiments: Large depth

In [None]:
""" Common hyper-parameters. """
# Re-seed torch before rebuilding the distribution and the dataset
# (presumably so that the generated data is reproducible — TODO confirm that
# power_unif_law / generate_data draw from torch's global RNG).
t.manual_seed(seed)

# Model parameters.
# NOTE: these assignments replace the globals of the same name set by earlier
# cells; the cells of this section rely on the values below.
N = 50
nb_layers = 5 # Depth of the network
nb_head = 1
n_gram = 3
context_window = n_gram  # the context window is set equal to the n-gram order

# Distribution parameters.
alphas = [1., 1., 1.]
nb_tokens=[N, N, 1]
pi = power_unif_law(alphas, nb_tokens, N)

# Training parameters.
batch_size=2**10
num_batch=1000
lr=5e-4
epochs=10
# The dataset is regenerated and shared by the cells of this section.
Data = generate_data(batch_size=batch_size, num_batch=num_batch, pi=pi, context_window=context_window)

In [None]:
""" Experiment 1. Scaling laws on H with fixed d=d_head. """
t.manual_seed(seed)

# Scaling parameters
d = 10
d_head = d
para_list=[1, 6, 11, 16, 21]
# File index of this run; 7 is also the index the large-depth
# Experiment 5 cell uses for d=10.
exp_num = 7

# One record (row of the output CSV) per number of heads H (=para).
records = []

for para in tqdm(para_list):
    accuracy = 0.

    for rep in range(repetition):
        model = AoT(d, N, nb_layers, para, d_head, nb_head, context_window, pi)

        # Bound to `history`, not `dict`: rebinding the builtin `dict` at
        # notebook level would break any later `dict(...)` call in the kernel.
        history = train(model, Data, epochs, lr=lr, next_token=True)
        # Average of the 100 'Acc' entries preceding the final one; the slice
        # [-101:-1] excludes the last entry. Kept unchanged so results stay
        # comparable with existing CSVs.
        # NOTE(review): confirm 'Acc' always holds more than 100 entries.
        accuracy += sum(history['Acc'][-101:-1])/100

    records.append({
        'acc': accuracy/repetition,  # mean over the repetitions
        'para': para,
        'N': N,
        'd': d,
        'd_head': d_head,
    })

# We save the results as a dataframe.
data = pd.DataFrame(records, columns=['acc', 'para', 'N', 'd', 'd_head'])
data.to_csv(f'Scaling laws/Data_exp_1_{exp_num}_depth.csv', index=False)

In [None]:
""" Experiment 2. Scaling laws on d_head, with d!=d_head and H (=para) fixed. """
t.manual_seed(seed)

# Model parameters.
d=10
para=21

# Scaling parameters
d_head_list=[1, 3, 5, 7, 10]  # the sweep ends at d_head = d

# One record (row of the output CSV) per head dimension.
records = []
for d_head in tqdm(d_head_list):
    accuracy = 0.

    for rep in range(repetition):
        model = AoT(d, N, nb_layers, para, d_head, nb_head, context_window, pi)

        # Bound to `history`, not `dict`: rebinding the builtin `dict` at
        # notebook level would break any later `dict(...)` call in the kernel.
        history = train(model, Data, epochs, lr=lr, next_token=True)
        # Average of the 100 'Acc' entries preceding the final one; the slice
        # [-101:-1] excludes the last entry. Kept unchanged so results stay
        # comparable with existing CSVs.
        # NOTE(review): confirm 'Acc' always holds more than 100 entries.
        accuracy += sum(history['Acc'][-101:-1])/100

    records.append({
        'acc': accuracy/repetition,  # mean over the repetitions
        'para': para,
        'N': N,
        'd': d,
        'd_head': d_head,
    })

# We save the results as a dataframe.
data = pd.DataFrame(records, columns=['acc', 'para', 'N', 'd', 'd_head'])
data.to_csv('Scaling laws/Data_exp_2_depth.csv', index=False)

In [None]:
""" Experiment 5. Scaling laws on the width of Transformer using MLPs. """
t.manual_seed(seed)

# Each embedding dimension d is saved under the file index exp_num.
for d, exp_num in zip([7, 10, 13], [4, 7, 10]):
    para = 1
    d_head = d
    # Widths sweep over 0, 10*d, 20*d, 30*d, 40*d.
    min_width = 2*d*(1-1)
    max_width = 2*d*(21-1)
    step = 2*d*5

    # One record (row of the output CSV) per MLP width.
    records = []
    for width in tqdm(range(min_width, max_width+1, step)):
        accuracy = 0.

        for rep in range(repetition):
            model = Transformer(d, N, nb_layers, width, para, d_head, nb_head, context_window, pi)

            # Bound to `history`, not `dict`: rebinding the builtin `dict` at
            # notebook level would break any later `dict(...)` call in the kernel.
            history = train(model, Data, epochs, lr=lr, next_token=True)
            # Average of the 100 'Acc' entries preceding the final one; the slice
            # [-101:-1] excludes the last entry. Kept unchanged so results stay
            # comparable with existing CSVs.
            # NOTE(review): confirm 'Acc' always holds more than 100 entries.
            accuracy += sum(history['Acc'][-101:-1])/100

        records.append({
            'acc': accuracy/repetition,  # mean over the repetitions
            'para': para,
            'N': N,
            'd': d,
            'd_head': d_head,
            'width': width,
        })

    # We save the results as a dataframe.
    data = pd.DataFrame(records, columns=['acc', 'para', 'N', 'd', 'd_head', 'width'])
    data.to_csv(f'Scaling laws/Data_exp_5_{exp_num}_depth.csv', index=False)


# AoT runs scaling the number of heads H (=para) with d=d_head, saved as
# Experiment-1 files under the file index exp_num.
for d, exp_num in zip([7, 13], [4, 10]):
    d_head = d
    min_para = 1
    max_para = 21
    step = 5

    # One record (row of the output CSV) per number of heads H (=para).
    records = []
    for para in tqdm(range(min_para, max_para+1, step)):
        accuracy = 0.

        for rep in range(repetition):
            model = AoT(d, N, nb_layers, para, d_head, nb_head, context_window, pi)

            # Bound to `history`, not `dict`: rebinding the builtin `dict` at
            # notebook level would break any later `dict(...)` call in the kernel.
            history = train(model, Data, epochs, lr=lr, next_token=True)
            # Average of the 100 'Acc' entries preceding the final one; the slice
            # [-101:-1] excludes the last entry. Kept unchanged so results stay
            # comparable with existing CSVs.
            # NOTE(review): confirm 'Acc' always holds more than 100 entries.
            accuracy += sum(history['Acc'][-101:-1])/100

        # No 'width' entry: the AoT constructor takes no width argument, and the
        # only `width` in scope here would be a stale value left over from a
        # previous loop. The columns match the other Data_exp_1_* files.
        records.append({
            'acc': accuracy/repetition,  # mean over the repetitions
            'para': para,
            'N': N,
            'd': d,
            'd_head': d_head,
        })

    # We save the results as a dataframe.
    data = pd.DataFrame(records, columns=['acc', 'para', 'N', 'd', 'd_head'])
    data.to_csv(f'Scaling laws/Data_exp_1_{exp_num}_depth.csv', index=False)