In [2]:
from core import *
from database import *
from simulator import *
from visualizer import *
import pandas as pd
import random
from _metrics import asymmetric_mae, huber, log_huber


In [None]:
# Grid of starting points for the fit: one axis of candidate values per
# parameter (E, A, B, alpha, beta).
# NOTE: there must be no trailing comma after the closing parenthesis, or
# `param_grid` becomes a 1-tuple wrapping the dict and `.values()` fails.
param_grid = dict(
    E=np.linspace(1, 5, 5),
    A=np.linspace(100, 4000, 10),
    B=np.linspace(100, 4000, 10),
    alpha=np.linspace(0.05, 0.9, 10),
    beta=np.linspace(0.05, 0.9, 10),
)


# _const = dict(
#     log_N=np.log(_df.N.values.astype(DTYPE)),
#     log_D=np.log(_df.D.values.astype(DTYPE)),
#     # y_true=_df.loss.values.astype(DTYPE),
#     y_true=_df[loss_name].values.astype(DTYPE),
#     weights=weights,
# )

# The absolute value range affects the differential optimization, so every
# axis is rescaled by its peak-to-peak extent.
_autoscale_range = np.array(list(map(np.ptp, param_grid.values())))
# In case of any axis with a single initial value (range 0), avoid dividing
# by zero:
_autoscale_range[_autoscale_range == 0] = 1.0

# Names assigned at cell (module) level are already global, so the former
# `if parallel: global ...` guard was a no-op and has been dropped; it also
# read `parallel`, which is not defined in this notebook.

# Cartesian product of all axes -> array of shape (n_combinations, n_params),
# each column divided by the matching axis range.
initial_guesses = np.array(list(itertools.product(*param_grid.values())))
initial_guesses = initial_guesses / _autoscale_range

print(_autoscale_range)

print(initial_guesses)


: 

In [5]:
loss_name = "C4 Eval Loss"
# NOTE(review): absolute, machine-specific path; consider a configurable
# data directory so the notebook can run elsewhere.
csv_file = "/fsx-onellm/margaretli/env_srcs/xlf/xlformers_n/scaling/data/data.csv"

col_names = ['C', 'D', 'N', 'lr', 'Avg Train Loss', 'Max Train Loss', 'C4 Eval PPL', 'Wiki Eval PPL', 'C4 Eval Loss', 'Wiki Eval Loss']

def read_data(csv_file, loss_name='C4 Eval Loss', col_names=col_names):
    """Load the runs CSV and keep the best run for each (N, D) pair.

    Rows with a missing `loss_name` value or a negative `lr` are discarded.
    For every remaining combination of `N` and `D`, the single row with the
    smallest `loss_name` value is kept.

    Args:
        csv_file: Path of the CSV file to read.
        loss_name: Column used both to drop missing rows and to select the
            minimum within each (N, D) group. It must be listed in
            `col_names`.
        col_names: Columns to load from the CSV.

    Returns:
        A DataFrame with one row per non-empty (N, D) group, ordered by first
        appearance of `N` and then by ascending `D`. The frame is also
        printed.
    """
    df = pd.read_csv(csv_file, usecols=col_names)
    df = df.dropna(subset=[loss_name])
    df = df.loc[df['lr'] >= 0]

    mins_only = []
    for n in df['N'].unique():
        n_df = df[df['N'] == n]
        for d in sorted(n_df['D'].unique()):
            cd_df = n_df[n_df['D'] == d]
            # Select on the requested loss column (previously hard-coded to
            # 'C4 Eval Loss', which ignored the `loss_name` argument).
            mins_only.append(cd_df.loc[cd_df[loss_name].idxmin()])

    mins_only_df = pd.DataFrame(mins_only)

    print(mins_only_df)
    return mins_only_df

# Read the CSV and keep, per (N, D) pair, the row with the lowest loss; the
# selected rows are printed by `read_data` itself.
_df = read_data(csv_file=csv_file, loss_name=loss_name)

        lr  Avg Train Loss  Max Train Loss  C4 Eval PPL  Wiki Eval PPL  \
4    0.004           4.957           5.185      146.194        216.300   
12   0.004           4.717           5.044      119.259        166.031   
17   0.004           4.453           4.659       89.148        115.380   
21   0.004           3.999           4.218       61.665         70.939   
25   0.004           3.736           4.058       46.984         47.592   
29   0.004           3.533           3.764       39.858         35.852   
33   0.002           4.925           5.171      141.632        224.183   
37   0.002           4.683           5.024      114.945        168.517   
42   0.004           3.880           4.109       55.653         62.826   
46   0.004           3.603           3.968       41.767         39.030   
50   0.004           3.448           3.680       36.775         32.393   
54   0.004           3.384           3.679       34.108         29.042   
58   0.004           3.312           3

In [6]:
# Per-sample loss used when scoring a parameter set.
loss_fn = asymmetric_mae

# Single-precision limits used to keep log/exp inside a finite range.
_single_info = np.finfo(np.single)
FLOAT_TINY = _single_info.tiny
FLOAT_LOGMAX = np.log(_single_info.max)

# Arrays derived once from the loaded runs: log of N, log of D, and the
# observed loss values that predictions are compared against.
_const = {
    "log_N": np.log(_df["N"].values.astype(DTYPE)),
    "log_D": np.log(_df["D"].values.astype(DTYPE)),
    "y_true": _df[loss_name].values.astype(DTYPE),
}

def calc_loss(E, A, B, alpha, beta, loss_fn=loss_fn):
    """Mean loss of the model E + A / N**alpha + B / D**beta on the loaded runs.

    Both power-law terms are evaluated in log space:
    exp(log(A) - alpha * log_N) and exp(log(B) - beta * log_D), with
    `log_N`, `log_D` and the targets `y_true` taken from the module-level
    `_const` dict.

    Args:
        E, A, B, alpha, beta: Parameters of the loss model. `A` and `B` are
            clipped from below at FLOAT_TINY before the logarithm is taken.
        loss_fn: Callable invoked as `loss_fn(y_true, y_pred)`.

    Returns:
        `np.mean` of the values returned by `loss_fn`.
    """
    # Lower clip keeps the logarithm finite for non-positive A / B.
    log_A = np.log(np.clip(A, FLOAT_TINY, None))
    log_B = np.log(np.clip(B, FLOAT_TINY, None))

    # Upper clip on the exponents limits the size of the exp() results.
    exponent_N = np.clip(log_A - alpha * _const["log_N"], None, FLOAT_LOGMAX)
    exponent_D = np.clip(log_B - beta * _const["log_D"], None, FLOAT_LOGMAX)

    y_pred = E + np.exp(exponent_N) + np.exp(exponent_D)

    return np.mean(loss_fn(_const["y_true"], y_pred))

In [25]:
# First parameter set to score with `calc_loss`.
# An earlier variant, kept for reference, used E=1.69337368 and
# beta=0.2849083 with the same A, B and alpha.
chinch_dict = {
    "E": 1,
    "A": 406.401018,
    "B": 410.722827,
    "alpha": 0.33917084,
    "beta": 0.2839083,
}
chinch_loss = calc_loss(**chinch_dict)
print(chinch_loss)


# Second parameter set, scored the same way for comparison.
found_params_dict = {
    "E": 2.018,
    "A": 42.89,
    "B": 33920,
    "alpha": 0.2608,
    "beta": 0.5013,
}
found_loss = calc_loss(**found_params_dict)
print(found_loss)



0.40568422569214721568
0.08190579904408484987


In [None]:
def chinchilla_flops_per_tok(seq_len, vocab_size, d_model, num_heads, num_layers, ffw_size):
    """Estimate training FLOPs per token (forward + backward).

    Follows the accounting of the Chinchilla paper, Appendix F
    (https://arxiv.org/pdf/2203.15556.pdf), expressed per token.

    Args:
        seq_len: Sequence length; enters the attention-logit, softmax and
            value-reduction terms.
        vocab_size: Accepted for interface compatibility but not used: the
            embedding and final-logit terms are excluded (see note below).
        d_model: Model width.
        num_heads: Number of attention heads.
        num_layers: Number of layers.
        ffw_size: Feed-forward hidden width.

    Returns:
        forward_flops + backward_flops, where backward is counted as twice
        the forward cost (as in Kaplan et al. 2020).
    """
    key_size = d_model // num_heads
    # Equals d_model only when d_model is divisible by num_heads.
    attn_width = key_size * num_heads

    # attention
    # key, query, value projections
    attention = 2 * 3 * d_model * attn_width
    # key @ query logits
    attlogits = 2 * seq_len * attn_width
    # softmax: 3* is for subtract (max), exp, divide (?)
    attsoftmax = 3 * num_heads * seq_len
    # softmax @ value reductions
    attvalue = 2 * seq_len * attn_width
    # final linear
    attlinear = 2 * attn_width * d_model
    att = attention + attlogits + attsoftmax + attvalue + attlinear

    # feed forward
    dense = 2 * (d_model * ffw_size + d_model * ffw_size)

    # One would expect the embedding (2 * vocab_size * d_model) and final
    # logit (2 * d_model * vocab_size) terms to be added to the forward pass:
    #     forward_flops = embeddings + num_layers * (att + dense) + logits
    # but per author correspondence there is apparently a typo in the paper:
    # they are not counted when reproducing table 4, so they are omitted here.
    forward_flops = num_layers * (att + dense)
    backward_flops = 2 * forward_flops  # as in Kaplan et al. 2020
    total_flops = forward_flops + backward_flops

    return total_flops