In [1]:
import os
import shutil
import sys
import json
import pandas as pd
import subprocess
from pathlib import Path
from IPython.display import display, Latex
from litgpt.scripts.convert_pretrained_checkpoint import convert_pretrained_checkpoint

# Make the modules under src/litgpt/eval importable from this notebook by
# appending that folder's absolute path to the import search path.
sys.path.append(os.fspath(Path("src", "litgpt", "eval").resolve()))

# from lm_eval_harness import run_eval_harness

## Locate the checkpoint

Below we define the path to the checkpoint directory saved by the training script, i.e., the folder that contains `lit_model.pth`. If the checkpoint was produced by a job, use the path inside that job's folder, e.g., `/teamspace/jobs/MY_JOB_NAME/share/results/CHECKPOINT_DIR_NAME`.

In [7]:
# Directory holding the checkpoint saved by the training script; the conversion
# cell below passes it on as `checkpoint_dir` and reuses its folder name.
pretrained_checkpoint_dir = Path("results/final")


## Convert the checkpoint if needed

The checkpoint saved by the training script contains the weights, but the evaluation harness will also require the model config and tokenizer config to generate the outputs. We can take these from the checkpoint folder of the initial base model and store them in one folder which we then pass to the evaluation harness.

In [9]:
# Export location for the converted checkpoint: results/export/<checkpoint folder name>
converted_checkpoint_dir = Path("results/export") / pretrained_checkpoint_dir.name

# Convert only when nothing exists at the export location yet, so re-running
# this cell does not repeat the conversion.
if not converted_checkpoint_dir.exists():
    convert_pretrained_checkpoint(
        checkpoint_dir=pretrained_checkpoint_dir,
        output_dir=converted_checkpoint_dir,
    )


Processing results/final/lit_model.pth
Saving converted checkpoint to results/export/final/lit_model.pth.


## Generate samples from the checkpoint

In [10]:
# Sampling settings for a free-form generation from the converted checkpoint.
prompt = "Examination"
temperature = 0.8
max_new_tokens = 256
seed = 1234

# Argument list for the `litgpt generate base` command line. The numeric
# settings are converted to strings because they are passed as process arguments.
generate_command = [
    "litgpt", "generate", "base",
    "--checkpoint_dir", converted_checkpoint_dir,
    "--prompt", prompt,
    "--max_new_tokens", str(max_new_tokens),
    "--temperature", str(temperature),
    "--seed", str(seed),
]

# Run the command and keep its standard output as text; a non-zero exit
# status raises subprocess.CalledProcessError.
out = subprocess.check_output(generate_command, text=True)

print("\n\n")
display(Latex(out))
# print(out)  # plain-text alternative to the Latex display


Loading model 'results/export/final/lit_model.pth' with {'name': 'tiny-llama-1.1b', 'hf_config': {'name': 'TinyLlama-1.1B-intermediate-step-1431k-3T', 'org': 'TinyLlama'}, 'scale_embeddings': False, 'block_size': 2048, 'vocab_size': 32000, 'padding_multiple': 64, 'padded_vocab_size': 32000, 'n_layer': 22, 'n_head': 32, 'head_size': 64, 'n_embd': 2048, 'rotary_percentage': 1.0, 'parallel_residual': False, 'bias': False, 'lm_head_bias': False, 'n_query_groups': 4, 'shared_attention_norm': False, 'norm_class_name': 'RMSNorm', 'norm_eps': 1e-05, 'mlp_class_name': 'LLaMAMLP', 'gelu_approximate': 'none', 'intermediate_size': 5632, 'rope_condense_ratio': 1, 'rope_base': 10000, 'n_expert': 0, 'n_expert_per_token': 0, 'rope_n_elem': 64}
Time to instantiate model: 0.09 seconds.
Time to load the model weights: 11.47 seconds.
Seed set to 1234
Time for inference 1: 5.17 sec total, 49.50 tokens/sec
Memory used: 2.28 GB







<IPython.core.display.Latex object>

In [None]:
# Classification-style prompt: the instruction is followed by the full report
# text, and only a few new tokens are requested for the answer.
prompt = """In the following radiology report, classify the patient's current microcalcification status as Positive, Negative or Not Stated. BILATERAL SCREENING MAMMOGRAPHY.
History: Screening.
Comparison available dating from 01/2023.
Findings:
There are scattered fibroglandular densities bilaterally. No skin thickening or nipple retraction is seen. No grouped calcifications are identified. No spiculated or circumscribed masses are seen.
IMPRESSION:
No mammographic evidence of malignancy. BI-RADS Category 1."""
temperature = 0.8
max_new_tokens = 10
seed = 1234

# Argument list for the `litgpt generate base` command line. The numeric
# settings are converted to strings because they are passed as process arguments.
generate_command = [
    "litgpt", "generate", "base",
    "--checkpoint_dir", converted_checkpoint_dir,
    "--prompt", prompt,
    "--max_new_tokens", str(max_new_tokens),
    "--temperature", str(temperature),
    "--seed", str(seed),
]

# Run the command and keep its standard output as text; a non-zero exit
# status raises subprocess.CalledProcessError.
out = subprocess.check_output(generate_command, text=True)

print("\n\n")
display(Latex(out))
# print(out)  # plain-text alternative to the Latex display

## Run the evaluation harness

This will download the benchmark datasets and use the GPU to run the evaluation on the selected tasks. Make sure nothing else is running on the GPU; otherwise, you may run out of memory. Depending on the selected tasks, this will take roughly 20 minutes to 1.5 hours to finish (see the progress bar).

In [25]:
# NOTE: This cell is currently disabled, as is the `run_eval_harness` import in
# the first cell; uncomment both to run the evaluation. The output shown below
# this cell appears to come from an earlier run (its paths use `final.pth`).
# `eval_tasks` is listed first because the `results_file` name is built from it.

# # The names of the tasks to evaluate over
# eval_tasks = [
#     "hellaswag",
#     "openbookqa",
#     "winogrande",
#     "arc_easy",
#     "arc_challenge",
#     "boolq",
#     "piqa",
#     # "gsm8k",
# ]

# # Where the outputs of the eval harness will be saved
# eval_dir = Path("results/evaluation", converted_checkpoint_dir.name)

# # Where the results will be saved (one JSON file named after the joined task list)
# results_file = eval_dir / f"{'-'.join(eval_tasks)}.json"

# run_eval_harness(
#     checkpoint_dir=converted_checkpoint_dir,
#     eval_tasks=eval_tasks,
#     save_filepath=results_file,
#     # If you want to do a "quick" run on a subset of the datasets, set a number here
#     limit=None,
# )

Loading model 'results/export/final.pth/lit_model.pth' with {'name': 'tiny-llama-1.1b', 'hf_config': {'org': 'TinyLlama', 'name': 'TinyLlama-1.1B-intermediate-step-955k-token-2T'}, 'block_size': 2048, 'vocab_size': 32000, 'padding_multiple': 64, 'padded_vocab_size': 32000, 'n_layer': 22, 'n_head': 32, 'n_embd': 2048, 'rotary_percentage': 1.0, 'parallel_residual': False, 'bias': False, 'lm_head_bias': False, 'n_query_groups': 4, 'shared_attention_norm': False, '_norm_class': 'RMSNorm', 'norm_eps': 1e-05, '_mlp_class': 'LLaMAMLP', 'gelu_approximate': 'none', 'intermediate_size': 5632, 'rope_condense_ratio': 1, 'rope_base': 10000, 'n_expert': 0, 'n_expert_per_token': 0, 'head_size': 64, 'rope_n_elem': 64}


Found tasks: ['piqa', 'boolq']
Task: piqa; number of docs: 1838
Task: piqa; document 0; context prompt (starting on next line):
Question: Remove seeds from  strawberries
Answer:
(end of prompt on previous line)
Requests: [Req_loglikelihood('Question: Remove seeds from  strawberries\nAnswer:', ' Blend the strawberries, pour the mixture through a fine-mesh strainer with a bowl underneath to catch the pulps and strain out the seeds')[0]
, Req_loglikelihood('Question: Remove seeds from  strawberries\nAnswer:', ' Chop up the strawberries, pour the mixture through a fine-mesh strainer with a bowl underneath to catch the pulps and strain out the seeds')[0]
]
Task: boolq; number of docs: 3270
Task: boolq; document 0; context prompt (starting on next line):
NCIS: New Orleans (season 4) -- The fourth season of NCIS: New Orleans premiered on September 26, 2017 on CBS. The series continues to air following Bull, Tuesday at 10:00 p.m. (ET) and contained 24 episodes. The season concluded on May 15, 

100%|██████████| 10216/10216 [04:03<00:00, 41.95it/s]

Saving results to 'results/evaluation/final.pth/hellaswag-openbookqa-winogrande-arc_easy-arc_challenge-boolq-piqa.json'





## Results

Here we load the JSON results file that was saved by the evaluation script.

In [26]:

# NOTE: This cell is currently disabled. It reads `results_file` and `eval_tasks`,
# which are only defined by the (also commented-out) evaluation cell above, so
# that cell has to be re-enabled and run first.
# NOTE(review): the table is keyed by `eval_tasks` but filled from `results`, and
# the average is divided by len(eval_tasks). If a task in `eval_tasks` is missing
# from `results`, its column stays an empty list and the average is understated;
# confirm both always contain the same tasks.

# with open(results_file, "r") as file:
#     results = json.load(file)["results"]

# print(results)

# # One column per task; prefer the normalized accuracy when it is reported
# table_data = {task_name: [] for task_name in eval_tasks}
# average = 0
# for task_name, task_results in results.items():
#     acc_key = "acc_norm" if "acc_norm" in task_results else "acc"
#     acc = task_results[acc_key] * 100
#     table_data[task_name] = [acc]
#     average += acc

# table_data["avg"] = average / len(eval_tasks)

# df = pd.DataFrame(table_data)
# pd.set_option('display.float_format', lambda x: f'{x:.2f}')

# df

{'piqa': {'acc': 0.720348204570185, 'acc_stderr': 0.010471899530306562, 'acc_norm': 0.7159956474428727, 'acc_norm_stderr': 0.010521147542454217}, 'boolq': {'acc': 0.6042813455657492, 'acc_stderr': 0.008552742471459795}}


Unnamed: 0,boolq,piqa,avg
0,60.43,71.6,66.01
