In [1]:
# from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
lang = "py"
run_lang = "python"

In [None]:
import wandb

api = wandb.Api()
entity, project = "loubnabnl", "scaling_laws"  # set to your entity and project 
runs = api.runs(entity + "/" + project) 

summary_list, config_list, name_list = [], [], []
for run in runs: 
    # .summary contains the output keys/values for metrics like accuracy.
    #  We call ._json_dict to omit large files 
    summary_list.append(run.summary._json_dict)

    # .config contains the hyperparameters.
    #  We remove special values that start with _.
    config_list.append(
        {k: v for k,v in run.config.items()
         if not k.startswith('_')})

    # .name is the human-readable name of the run.
    name_list.append(run.name)

orig_runs_df = pd.DataFrame({
    "summary": summary_list,
    "config": config_list,
    "name": name_list
    })

def expand_dict_to_columns(df, col):
    return pd.concat([df.drop([col], axis=1),  pd.json_normalize(df[col])], axis=1)

run_df = expand_dict_to_columns(orig_runs_df, "summary")
run_df = expand_dict_to_columns(run_df, "config")

keep_cols = [c for c in run_df if "lm-loss-validation/TEST_" not in c]
run_df = run_df[keep_cols]

df_run = run_df[run_df["_step"] >= .99 * run_df["train_iters"]]
df_run.head(2)

In [None]:
scores = load_dataset("nuprl/pass_k_with_MultiPL-E", split="train", use_auth_token=True, revision="bigcode_scaling_laws")

scores = scores.map(lambda x: {"idx": int(x["Experiment"].split("idx_")[1].split("-")[0])})
scores = scores.map(lambda x: {"lang": x["Experiment"].split("humaneval-")[1].split("-")[0]})

scores = scores.filter(lambda x: x["k"]==100)
scores = scores.filter(lambda x: x["lang"]==lang)

df_scores = scores.to_pandas()

In [19]:
DEFAULT_PROJECT_DIR = "/fsx-onellm/margaretli/env_srcs/xlf/xlformers_n/scaling/data"

col_names = ['C', 'D', 'N', 'lr', 'Avg Train Loss', 'Max Train Loss', 'C4 Eval PPL', 'Wiki Eval PPL', 'C4 Eval Loss', 'Wiki Eval Loss']
# col_names = ['C', 'N', 'loss']

def read_data(csv_file, loss_name='C4 Eval Loss', col_names=col_names):
    mins_only = []
    df = pd.read_csv(csv_file,)
    df.dropna(subset=[loss_name], inplace=True, use_cols=col_names)

    if 'lr' in df.columns:
        df = df.loc[(df['lr'] >= 0)]
    if 'D' not in df.columns:
        df['D'] = df['C'] / (df['N'] * 6)

    n_vals = df['N'].unique()
    d_vals = sorted(df['D'].unique())
    for n in n_vals:
        for d in d_vals:
            cd_df = df[(df['N'] == n) & (df['D'] == d)]
            if cd_df.empty:
                continue
            min_index = cd_df[loss_name].idxmin()
            mins_only.append(cd_df.loc[min_index])

    mins_only_df = pd.DataFrame(mins_only)
    print(mins_only_df)
    return mins_only_df

In [14]:
import torch
import numpy as np

def scaling_law(N, D, params):
    a, b, e, alpha, beta = params
    A = np.exp(a)
    B = np.exp(b)
    E = np.exp(e)
    
    L = E + (A / (N**alpha)) + (B /(D**beta))
    
    return L

def opt_N_D(C, G, opt_a, opt_b):
    opt_N = G*(C/6)**opt_a
    opt_D = (1/G)*(C/6)**opt_b
    return opt_N, opt_D

def loss(inp, params, loss_type='huber'):
    a, b, e, alpha, beta = params[0], params[1], params[2], params[3], params[4]
    pre_lse = torch.stack([a - alpha*torch.log(inp[:, 0]), b - beta*torch.log(inp[:, 1]), e.expand((inp.shape[0]))])
    post_lse = torch.logsumexp(pre_lse, dim=0)
    if loss_type == 'huber':
        # huber_loss = torch.nn.functional.huber_loss(post_lse, torch.log(inp[:, 2]), delta=1e-3, reduction='none')
        loss = torch.nn.functional.huber_loss(post_lse, torch.log(inp[:, 2]), delta=10, reduction='none')
    elif loss_type == 'mse':
        loss = torch.nn.functional.mse_loss(post_lse, torch.log(inp[:, 2]), reduction='none')
    elif loss_type == 'l1':
        loss = torch.nn.functional.l1_loss(post_lse, torch.log(inp[:, 2]), reduction='none')
    elif loss_type == 'smooth_l1':
        loss = torch.nn.functional.smooth_l1_loss(post_lse, torch.log(inp[:, 2]), reduction='none')
    elif loss_type == 'kl':
        loss = torch.nn.functional.kl_div(post_lse, torch.log(inp[:, 2]), reduction='none')
    elif loss_type == 'cosine':
        loss = torch.nn.functional.cosine_similarity(post_lse, torch.log(inp[:, 2]), reduction='none')
    elif loss_type == 'poisson':
        loss = torch.nn.functional.poisson_nll_loss(post_lse, torch.log(inp[:, 2]), reduction='none')
    elif loss_type == 'nll':
        loss = torch.nn.functional.nll_loss(post_lse, torch.log(inp[:, 2]), reduction='none')
    
    return loss.sum()

def minimize_loss(inp, init_params=[6, 6, -1, 0.28, 0.32], steps=500, algorithm="lbfgs", lr=1e-2):
    params = torch.nn.Parameter(data=torch.Tensor(init_params))
    
    if algorithm == "lbfgs":
        opt = torch.optim.LBFGS([params],
                    lr=lr,
                    history_size=10, 
                    max_iter=20, 
                    line_search_fn="strong_wolfe")
    elif algorithm == "adam":
        opt = torch.optim.Adam([params], lr=lr)
    elif algorithm == "sgd":    
        opt = torch.optim.SGD([params], lr=lr)
    elif algorithm == "adagrad":
        opt = torch.optim.Adagrad([params], lr=lr)
    elif algorithm == "adadelta":
        opt = torch.optim.Adadelta([params], lr=lr)
    elif algorithm == "rmsprop":
        opt = torch.optim.RMSprop([params], lr=lr)
    elif algorithm == "rprop":
        opt = torch.optim.Rprop([params], lr=lr)
    elif algorithm == "adamw":
        opt = torch.optim.AdamW([params], lr=lr)
    elif algorithm == "sparseadam":
        opt = torch.optim.SparseAdam([params], lr=lr)
    elif algorithm == "adamax":
        opt = torch.optim.Adamax([params], lr=lr)
    elif algorithm == "asgd":
        opt = torch.optim.ASGD([params], lr=lr)
    else:
        raise ValueError("Invalid algorithm")

    def closure():
        opt.zero_grad()
        l = loss(inp, params)
        l.backward()
        return l

    for i in range(steps):
        l = opt.step(closure)
    return l, params

In [36]:
loss_name = "C4 Eval Loss"
csv_file = "/fsx-onellm/margaretli/env_srcs/xlf/xlformers_n/scaling/data/dense.csv"
fit_df = read_data(csv_file=csv_file, loss_name=loss_name)
# loss_name = "loss"
# csv_file = "/fsx-onellm/margaretli/env_srcs/xlf/xlformers_n/scaling/data/epoch_ai.csv"
# fit_df = read_data(csv_file=csv_file, loss_name=loss_name, col_names=None)

project_dir=DEFAULT_PROJECT_DIR

# fit_df = fit_df[fit_df['N'] < 100000000]

inp = torch.Tensor([[N, D, L] for N, D, L in 
                    zip(fit_df["N"], fit_df["D"], fit_df[loss_name])])
inp.require_grad = True
steps = 20000
lr = 1e-2

all_algorithms = ["lbfgs", "adam", "adagrad", "rmsprop", "rprop", "adamw", "adamax"]
# all_algorithms = ["lbfgs", "adam", "sgd", "adagrad", "adadelta", "rmsprop", "rprop", "adamw", "adamax", "asgd"]
all_best_params = {}

min_all_loss = 1e10
best_algorithm = ''

for algorithm in all_algorithms:
    print("Algorithm: ", algorithm)

    min_loss = 1e10
    # for a in tqdm(np.linspace(0, 10, 5)):
    #     for b in np.linspace(0, 10, 5):
    #         for e in np.linspace(-1, 2, 4):
    #             for alpha in np.linspace(0, 1, 4):
    #                 for beta in np.linspace(0, 1, 4):
    #                     l, params = minimize_loss(inp, [a, b, e, alpha, beta], algorithm=algorithm, steps=steps, lr=lr)
    #                     if l < min_loss:
    #                         min_loss = l
    #                         best_params = params.detach().numpy()

    for a in tqdm([5]):
        for b in [10]:
            for e in [0.5]:
                for alpha in [0.5]:
                    for beta in [0.5]:
                        l, params = minimize_loss(inp, [a, b, e, alpha, beta], algorithm=algorithm, steps=steps, lr=lr)
                        if l < min_loss:
                            min_loss = l
                            best_params = params.detach().numpy()


    print(min_loss)
    print(best_params)
    all_best_params[algorithm] = best_params
    if min_loss < min_all_loss:
        min_all_loss = min_loss
        best_algorithm = algorithm
    
print("Best Algorithm: ", best_algorithm)
print("Best Params: ", all_best_params[best_algorithm])

     Params  total steps      lr  Avg Train Loss  Max Train Loss  C4 Eval PPL  \
4       NaN          NaN  0.0040           4.957           5.185      146.194   
12      NaN          NaN  0.0040           4.717           5.044      119.259   
17      NaN          NaN  0.0040           4.453           4.659       89.148   
21      NaN          NaN  0.0040           3.999           4.218       61.665   
25      NaN          NaN  0.0040           3.736           4.058       46.984   
..      ...          ...     ...             ...             ...          ...   
268     NaN          NaN  0.0020           2.899           3.182       21.522   
272     NaN          NaN  0.0020           2.864           3.263       20.629   
276     NaN          NaN  0.0020           2.815           3.157       19.575   
286     NaN          NaN  0.0010           2.725           3.060       17.837   
295     NaN          NaN  0.0008           2.487           2.915       14.389   

     Wiki Eval PPL  C4 Eval

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:34<00:00, 34.53s/it]


tensor(0.0234, grad_fn=<SumBackward0>)
[5.011801   9.981039   0.73496526 0.32206574 0.48285556]
Algorithm:  adam


100%|██████████| 1/1 [00:04<00:00,  4.81s/it]


tensor(0.0129, grad_fn=<SumBackward0>)
[ 3.445105  14.621374   0.7848881  0.2189644  0.7264511]
Algorithm:  adagrad


100%|██████████| 1/1 [00:04<00:00,  4.70s/it]


tensor(0.0230, grad_fn=<SumBackward0>)
[ 4.892044   10.048826    0.7287354   0.3132959   0.48643562]
Algorithm:  rmsprop


100%|██████████| 1/1 [00:04<00:00,  4.44s/it]


tensor(0.0541, grad_fn=<SumBackward0>)
[ 7.169803   12.82391     0.9486355   0.48422384  0.6281711 ]
Algorithm:  rprop


100%|██████████| 1/1 [00:05<00:00,  5.26s/it]


tensor(0.0116, grad_fn=<SumBackward0>)
[ 4.262972   14.428007    0.8462173   0.27863997  0.7154747 ]
Algorithm:  adamw


100%|██████████| 1/1 [00:04<00:00,  4.89s/it]


tensor(0.0299, grad_fn=<SumBackward0>)
[ 2.2191088   8.97714    -0.42929128  0.09013393  0.429927  ]
Algorithm:  adamax


100%|██████████| 1/1 [00:05<00:00,  5.02s/it]

tensor(0.0113, grad_fn=<SumBackward0>)
[ 3.2709134  14.612422    0.76042837  0.20365712  0.7250983 ]
Best Algorithm:  adamax
Best Params:  [ 3.2709134  14.612422    0.76042837  0.20365712  0.7250983 ]





In [35]:
# all_best_params = {
#     "lbfgs": [6.291975, 12.452576, 0.90624857, 0.45291403, 0.60690635],
#     "adam": [10.057474, 11.168409, 0.8845671, 0.7009503, 0.5374865],
#     "sgd": [-44.88375, -8.306413, 41.089928, 867.93774, 228.67094],
#     "adagrad": [7.6266613, 10.241391, 0.82940316, 0.556825, 0.48704857],
#     "adadelta": [0.31501442, 10.162561, 0.13301994, 0.01270814, 0.47626096],
#     "rmsprop": [ 0.3898864, 10.235114, 0.95406806, 0.61034465, 0.4942382],
#     "rprop": [5.42916, 10.889063, 0.83016324, 0.39976725, 0.5223741],
#     "adamw": [1.6131105, 6.284966, 0.02174136, 0.13462491, 0.2669011],
#     "adamax": [9.979901, 10.904844, 0.870543, 0.69669473, 0.52316064],
#     "asgd": [-1.2160281e-01, -1.3072298e+01, 4.0904160e+01, 2.1169136e+00, 3.7313159e+02],
# }

for algorithm, best_params in all_best_params.items():
    opt_alpha = best_params[-2]
    opt_beta = best_params[-1]

    opt_a =  opt_beta / (opt_alpha+opt_beta)
    opt_b =  opt_alpha / (opt_alpha+opt_beta)

    A = np.exp(best_params[0])
    B = np.exp(best_params[1])
    G = ((opt_alpha*A)/(opt_beta*B))**(1/(opt_alpha+opt_beta))

    scaling = []

    for C in [1.25E+18, 5.01E+18, 1.98E+19, 1E21, 1E23]:
        N, D = opt_N_D(C, G, opt_a, opt_b)
        scaling.append(
            {"compute": f"{C:e}",
            "parameters (B)": f"{N/1e9:.2f}",
            "tokens (B)": f"{D/1e9:.2f}",
            "ratio": f"{D/N:.2f}",
            }
        )
    
    print("Algorithm: ", algorithm)
    print("Scaling: ", pd.DataFrame(scaling))
        



Algorithm:  lbfgs
Scaling:          compute parameters (B) tokens (B)    ratio
0  1.250000e+18           0.01      20.61  2039.66
1  5.010000e+18           0.02      38.04  1733.26
2  1.980000e+19           0.05      69.78  1475.32
3  1.000000e+21           0.42     394.01   931.48
4  1.000000e+23           5.54    3007.90   542.85
Algorithm:  adam
Scaling:          compute parameters (B) tokens (B)   ratio
0  1.250000e+18           0.02      13.61  888.80
1  5.010000e+18           0.04      22.69  616.45
2  1.980000e+19           0.09      37.63  429.14
3  1.000000e+21           1.04     159.50  152.64
4  1.000000e+23          19.17     869.36   45.35
Algorithm:  adagrad
Scaling:          compute parameters (B) tokens (B)    ratio
0  1.250000e+18           0.01      20.51  2020.00
1  5.010000e+18           0.02      37.29  1665.11
2  1.980000e+19           0.05      67.37  1375.25
3  1.000000e+21           0.46     364.41   796.76
4  1.000000e+23           6.30    2644.94   419.74
Alg

In [58]:
scaling = []

for C in [1.92e19, 1.21e20, 1.23e22, 5.76e23, 3.85e24, 9.90e24, 3.43e25, 1.27e26, 1.30e28]:
    N, D = opt_N_D(C, G, opt_a, opt_b)
    scaling.append(
        {"compute": f"{C:e}",
         "parameters (B)": f"{N/1e9:.2f}",
         "tokens (B)": f"{D/1e9:.2f}",
        }
    )
     
pd.DataFrame(scaling)

Unnamed: 0,compute,parameters (B),tokens (B)
0,1.92e+19,0.18,18.13
1,1.21e+20,1.0,20.21
2,1.23e+22,77.25,26.54
3,5.76e+23,2883.82,33.29
4,3.85e+24,17234.12,37.23
5,9.9e+24,41917.26,39.36
6,3.43e+25,134974.2,42.35
7,1.27e+26,462657.1,45.75
8,1.3e+28,36053325.81,60.1


In [47]:

scaling_law(1e10, 100e9, best_params)

2.2020082572609923

In [60]:
import plotly.express as px
import plotly.graph_objects as go

color_map={
           "3e+18": "orange",
           "6e+18": "black",
           "3e+18": "brown",
           "1e+19": "green",
           "3e+19": "purple",
           "6e+19": "red",
           "1e+20": "blue",
           "3e+20": "pink",
           "6e+20": "gold",
           "1e+21": "silver",
          }
fit_df.head(4)

# fit_df['C'] = fit_df['C'].astype(str)
fig = px.scatter(fit_df, x='N', y=loss_name, color='C', 
                 log_x=True, color_discrete_map=color_map)

for compute in color_map.keys():
    tmp_df = fit_df[fit_df['C'] == compute]
    df_d = list() 
    for _, row in tmp_df.iterrows():
        pred = scaling_law(row['N'], row['D'], best_params)
        df_d.append({'prediction': pred, 'N': row['N'], 'D': row['D']})
    fig2 = px.line(pd.DataFrame(df_d).sort_values('N'), 
                   x='N', y='prediction', log_x=True)
    fig2.update_traces(line_color=color_map[compute], line_width=2)
    fig = go.Figure(data=fig.data + fig2.data)

fig.show()

KeyError: 'N'