In [1]:
def average_values(dictionary):
    """Return the arithmetic mean of the values stored in ``dictionary``.

    Args:
        dictionary: mapping whose values are numbers; the keys are ignored.

    Returns:
        The sum of the values divided by the number of entries.

    Raises:
        ValueError: if ``dictionary`` has no entries, since the mean is undefined.
    """
    if len(dictionary) == 0:
        raise ValueError("cannot average the values of an empty dictionary")
    # Plain left-to-right accumulation (instead of the built-in sum) keeps the
    # floating-point result identical to the previous implementation.
    total = 0
    for value in dictionary.values():
        total += value
    return total / len(dictionary)

In [2]:
# Per-task accuracy metrics used throughout the notebook: a run must expose all
# of them in its summary to be kept, and they are the fields requested from each
# run's history and averaged into a single score.
keys = [
    'ai2_arc_ARC-Challenge/acc',
    'ai2_arc_ARC-Easy/acc',
    'hellaswag/acc',
    'math_qa/acc',
    'piqa/acc',
    'pubmed_qa/acc',
    'super_glue_copa/acc',
    'winogrande_winogrande_s/acc',
]

In [3]:
import wandb

# Weights & Biases project whose runs are analysed in this notebook.
WANDB_PROJECT = "Gradient Ascent Limit Test"

api = wandb.Api(timeout=100)
# Materialise the run listing into a list so it can be counted and re-iterated.
all_runs = list(api.runs(WANDB_PROJECT))
print(f"{len(all_runs)} runs found")

1749 runs found


In [4]:
from tqdm import tqdm
# Keep only the runs whose summary contains every metric listed in `keys`.
valid_runs = []
for run in all_runs:
    missing = set(keys).difference(run.summary.keys())
    if missing:
        tqdm.write(f"{run.name} lacks the proper summary keys, skipping...")
        continue
    valid_runs.append(run)

OPT-1.3B-Unlike.-CC_0 lacks the proper summary keys, skipping...
OPT-2.7B-Epoch_-CC_39 lacks the proper summary keys, skipping...
OPT-2.7B-Epoch_-CC_39 lacks the proper summary keys, skipping...
OPT-2.7B-Epoch_-CC_58 lacks the proper summary keys, skipping...
OPT-2.7B-Epoch_-CC_94 lacks the proper summary keys, skipping...
OPT-2.7B-Epoch_-CC_39 lacks the proper summary keys, skipping...
OPT-1.3B-Github_57-Unlike. lacks the proper summary keys, skipping...
OPT-1.3B-Extraction_52-Unlike. lacks the proper summary keys, skipping...
OPT-1.3B-CC_61-Unlike. lacks the proper summary keys, skipping...
OPT-1.3B-Github_47-Unlike. lacks the proper summary keys, skipping...
OPT-1.3B-Github_40-Unlike. lacks the proper summary keys, skipping...
OPT-1.3B-CC_61-Unlike. lacks the proper summary keys, skipping...
OPT-1.3B-Github_57-Unlike. lacks the proper summary keys, skipping...
OPT-1.3B-Github_47-Unlike. lacks the proper summary keys, skipping...
OPT-1.3B-Extraction_52-Unlike. lacks the proper summar

In [5]:
# Valid runs whose configured base model is facebook/opt-1.3b.
runs_1b = [
    run
    for run in valid_runs
    if run.config['model_name_or_path'] == 'facebook/opt-1.3b'
]
len(runs_1b)

724

In [6]:
# Valid runs whose configured base model is facebook/opt-350m.
runs_350m = [
    run
    for run in valid_runs
    if run.config['model_name_or_path'] == 'facebook/opt-350m'
]
len(runs_350m)

694

In [7]:
# Valid runs whose configured base model is facebook/opt-2.7b.
runs_3b = [
    run
    for run in valid_runs
    if run.config['model_name_or_path'] == 'facebook/opt-2.7b'
]
len(runs_3b)

305

In [8]:
import random
# Seed the global `random` state so the random.sample draws made by
# sample_with_corpora are repeatable when the cells are run in order.
random.seed(42)

def sample_with_corpora(runs, sample_size=100):
  """Bucket batch-size-1 runs by training corpus and subsample each bucket.

  Runs whose ``config['train_batch_size']`` is not 1 are ignored. Every
  remaining run is assigned to the first of 'extraction', 'cc', 'github' that
  occurs as a substring of its ``config['train_set']``.

  Args:
    runs: iterable of run objects exposing a ``config`` mapping with the
      'train_batch_size' and 'train_set' entries.
    sample_size: maximum number of runs kept per corpus (default 100). A
      bucket holding at least this many runs is randomly sampled down to
      exactly ``sample_size`` using the global ``random`` state; a smaller
      bucket is kept whole, in its original order.

  Returns:
    A single list with the extraction runs first, then cc, then github.

  Raises:
    KeyError: if a batch-size-1 run's train_set matches none of the corpora.
  """
  corpora = {'extraction': [], 'cc': [], 'github': []}
  for run in runs:
    if run.config['train_batch_size'] != 1:
      continue
    train_set = run.config['train_set']
    # Dict insertion order gives the matching priority: extraction, cc, github.
    for corpus in corpora:
      if corpus in train_set:
        corpora[corpus].append(run)
        break
    else:
      raise KeyError(
        f"unrecognised train_set {train_set!r}: expected it to contain "
        "'extraction', 'cc' or 'github'"
      )

  for corpus, bucket in corpora.items():
    # Explicit size check instead of swallowing random.sample's ValueError:
    # buckets that are too small are deliberately left untouched.
    if len(bucket) >= sample_size:
      corpora[corpus] = random.sample(bucket, sample_size)

  # Report the per-corpus counts so short buckets are visible in the output.
  print(len(corpora['extraction']))
  print(len(corpora['cc']))
  print(len(corpora['github']))

  return corpora['extraction'] + corpora['cc'] + corpora['github']

In [9]:
# Subsample each model's runs per corpus. The calls share the global `random`
# state, so their order determines which runs are drawn. Each call overwrites
# its own input list, so re-running this cell samples from the already-sampled
# lists rather than from the full per-model lists.
runs_1b = sample_with_corpora(runs_1b)
runs_350m = sample_with_corpora(runs_350m)
runs_3b = sample_with_corpora(runs_3b)

100
94
100
100
100
100
100
100
100


In [10]:
# Number of sampled runs per model size: 1.3B, 350M, 2.7B.
print(*map(len, (runs_1b, runs_350m, runs_3b)))

294 300 300


In [11]:
def calc_best_for_all_runs(runs):
  """For every run, pick the history row with the highest mean accuracy.

  Each run's history is scanned for the metrics in the module-level ``keys``
  list, every returned row is averaged with ``average_values``, and the row
  with the highest average is selected (the earliest one on ties).

  Args:
    runs: iterable of run objects exposing ``scan_history``, ``config`` and
      ``name``.

  Returns:
    Four parallel lists with one entry per run:
      best_average_all: the highest per-row average.
      best_non_average_all: the history row that achieved it.
      best_epoch_all: position of that row in the scanned history
        (the notebook treats one row as one epoch).
      corpora: 'TDEC', 'CC' or 'Git.' depending on whether the run's
        train_set contains 'extraction', 'cc' or 'github' (checked in that
        order).

  Raises:
    KeyError: if a run's train_set matches none of the known corpora.
    ValueError: if scanning a run's history yields no rows.
  """
  # (substring of train_set, label) pairs, in matching priority order.
  corpus_labels = (('extraction', 'TDEC'), ('cc', 'CC'), ('github', 'Git.'))

  best_average_all = []
  best_non_average_all = []
  best_epoch_all = []
  corpora = []
  for run in tqdm(runs):
    # Classify before touching the history so an unknown corpus fails before
    # the slow scan rather than after it.
    train_set = run.config['train_set']
    label = next((name for marker, name in corpus_labels if marker in train_set), None)
    if label is None:
      raise KeyError(
        f"run {run.name!r} has unrecognised train_set {train_set!r}: expected "
        "it to contain 'extraction', 'cc' or 'github'"
      )

    data = list(run.scan_history(keys=keys))
    if not data:
      raise ValueError(f"run {run.name!r} has no history rows for the requested keys")

    averages = [average_values(row) for row in data]  # one entry per history row
    best_average = max(averages)
    best_epoch = averages.index(best_average)  # first occurrence wins on ties

    best_average_all.append(best_average)
    best_non_average_all.append(data[best_epoch])
    best_epoch_all.append(best_epoch)
    corpora.append(label)
  return best_average_all, best_non_average_all, best_epoch_all, corpora

In [12]:
average_1b, non_average_1b, epoch_1b, c_1b = calc_best_for_all_runs(runs_1b)
average_350m, non_average_350m, epoch_350m, c_350m = calc_best_for_all_runs(runs_350m)
average_3b, non_average_3b, epoch_3b, c_3b = calc_best_for_all_runs(runs_3b)

100%|██████████| 294/294 [05:15<00:00,  1.07s/it]
100%|██████████| 300/300 [05:25<00:00,  1.09s/it]
100%|██████████| 300/300 [05:26<00:00,  1.09s/it]


In [13]:
import numpy as np
import pandas as pd
# Mean, then median, of the best per-run averages for the 1.3B model.
print(np.mean(average_1b), np.median(average_1b), sep='\n')

0.5096514680023704
0.5091518862172961


In [14]:
# One frame per model: the best average ('value') and corpus label, followed by
# the per-task accuracies of the best row and a trailing 'model' column.
df_350m_non_averaged = pd.DataFrame(non_average_350m)
df_350m = pd.concat(
    [pd.DataFrame({'value': average_350m, 'corpora': c_350m}), df_350m_non_averaged],
    axis=1,
).assign(model='350M')

df_1b_non_averaged = pd.DataFrame(non_average_1b)
df_1b = pd.concat(
    [pd.DataFrame({'value': average_1b, 'corpora': c_1b}), df_1b_non_averaged],
    axis=1,
).assign(model='1.3B')

df_3b_non_averaged = pd.DataFrame(non_average_3b)
df_3b = pd.concat(
    [pd.DataFrame({'value': average_3b, 'corpora': c_3b}), df_3b_non_averaged],
    axis=1,
).assign(model='2.7B')

# Stack the three models row-wise, smallest model first.
df_melt = pd.concat([df_350m, df_1b, df_3b])
df_melt.head()

Unnamed: 0,value,corpora,ai2_arc_ARC-Challenge/acc,ai2_arc_ARC-Easy/acc,hellaswag/acc,math_qa/acc,piqa/acc,pubmed_qa/acc,super_glue_copa/acc,winogrande_winogrande_s/acc,model
0,0.459342,TDEC,0.19661,0.45,0.3875,0.203125,0.665625,0.521875,0.7,0.55,350M
1,0.457722,TDEC,0.20678,0.45625,0.365625,0.196875,0.68125,0.540625,0.68,0.534375,350M
2,0.457211,TDEC,0.186441,0.45625,0.359375,0.21875,0.675,0.540625,0.69,0.53125,350M
3,0.457211,TDEC,0.186441,0.45625,0.359375,0.21875,0.675,0.540625,0.69,0.53125,350M
4,0.457211,TDEC,0.186441,0.45625,0.359375,0.21875,0.675,0.540625,0.69,0.53125,350M


In [16]:
from pathlib import Path

# Destination of the combined per-run results table.
RESULTS_CSV = Path('../full_results/classification.csv')
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing.
RESULTS_CSV.parent.mkdir(parents=True, exist_ok=True)
df_melt.to_csv(RESULTS_CSV, index=False)