Support for nanotron #11
Changes from all commits: 1a5d346, 5ca5835, 60fcb76, ac4a64a, fbb2321, aefd6d5, 3a6922c, 39befd2, 33dbd59, eb2f4f4, e47bad9, e93fb58, 1c98e44, 8fa1df0, aab3f81, 0bec0db
@@ -37,4 +37,5 @@ repos:
     rev: 'v0.1.6'
     hooks:
       - id: ruff
         args: ['--fix']
+      - id: ruff-format
@@ -0,0 +1,92 @@
import argparse

from lighteval.main_accelerate import CACHE_DIR, main


def get_parser():
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    task_type_group = parser.add_mutually_exclusive_group(required=True)

    # Model type 1) Base model
    weight_type_group = parser.add_mutually_exclusive_group()
    weight_type_group.add_argument(
        "--delta_weights",
        action="store_true",
        default=False,
        help="set to True if your model should be merged with a base model, also need to provide the base model name",
    )
    weight_type_group.add_argument(
        "--adapter_weights",
        action="store_true",
        default=False,
        help="set to True if your model has been trained with peft, also need to provide the base model name",
    )
    parser.add_argument(
        "--base_model", type=str, default=None, help="name of the base model to be used for delta or adapter weights"
    )

    task_type_group.add_argument("--model_args")
    parser.add_argument("--model_dtype", type=str, default=None)
    parser.add_argument(
        "--multichoice_continuations_start_space",
        action="store_true",
        help="Whether to force multiple choice continuations to start with a space",
    )
    parser.add_argument(
        "--no_multichoice_continuations_start_space",
        action="store_true",
        help="Whether to force multiple choice continuations to not start with a space",
    )
    parser.add_argument("--use_chat_template", default=False, action="store_true")
    # Model type 2) TGI
    task_type_group.add_argument("--inference_server_address", type=str)
    parser.add_argument("--inference_server_auth", type=str, default=None)
    # Model type 3) Inference endpoints
    task_type_group.add_argument("--endpoint_model_name", type=str)
    parser.add_argument("--accelerator", type=str, default=None)
    parser.add_argument("--vendor", type=str, default=None)
    parser.add_argument("--region", type=str, default=None)
    parser.add_argument("--instance_size", type=str, default=None)
    parser.add_argument("--instance_type", type=str, default=None)
    parser.add_argument("--reuse_existing", default=False, action="store_true")
    # Debug
    parser.add_argument("--max_samples", type=int, default=None)
    parser.add_argument("--job_id", type=str, help="Optional Job ID for future reference", default="")
    # Saving
    parser.add_argument("--push_results_to_hub", default=False, action="store_true")
    parser.add_argument("--save_details", action="store_true")
    parser.add_argument("--push_details_to_hub", default=False, action="store_true")
    parser.add_argument(
        "--public_run", default=False, action="store_true", help="Push results and details to a public repo"
    )
    parser.add_argument("--cache_dir", type=str, default=CACHE_DIR)
    parser.add_argument(
        "--results_org",
        type=str,
        help="Hub organisation where you want to store the results. Your current token must have write access to it",
    )
    # Common parameters
    parser.add_argument("--output_dir", required=True)
    parser.add_argument("--override_batch_size", type=int, default=-1)
    parser.add_argument("--dataset_loading_processes", type=int, default=1)
    parser.add_argument(
        "--custom_tasks_file",
        type=str,
        default=None,
        help="Path to a file with custom tasks (a TASK list of dicts and potentially prompt formatting functions)",
    )
    group.add_argument(
        "--tasks",
        type=str,
        default=None,
        help="Id of a task, e.g. 'original|mmlu:abstract_algebra|5', or path to a text file with a list of tasks",
    )
    parser.add_argument("--num_fewshot_seeds", type=int, default=1, help="Number of seeds to use for few-shot sampling")
    return parser


if __name__ == "__main__":
    parser = get_parser()
    args, unknowns = parser.parse_known_args()
    main(args)
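A small sketch of how the parser above can be exercised without launching an evaluation. The model spec and output directory are hypothetical placeholders; the task id only reuses the format given in the `--tasks` help string, and `get_parser()` from the script above is assumed to be in scope.

```python
# Illustrative only: parse a hypothetical argument list with the parser defined above.
parser = get_parser()
args = parser.parse_args(
    [
        "--model_args", "pretrained=my-org/my-model",   # model type 1) base model (hypothetical spec)
        "--tasks", "original|mmlu:abstract_algebra|5",   # task id format from the --tasks help text
        "--output_dir", "./eval_out",                    # hypothetical output directory
    ]
)
# Exactly one of --model_args / --inference_server_address / --endpoint_model_name is accepted,
# since they share a required mutually exclusive group.
print(args.model_args, args.tasks, args.output_dir, args.override_batch_size)
```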
@@ -0,0 +1,33 @@
# flake8: noqa: C901
import argparse

from lighteval.main_nanotron import main


def get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--checkpoint-config-path",
        type=str,
        required=True,
        help="Path to the brr checkpoint YAML or python config file, potentially on S3",
    )
    parser.add_argument(
        "--lighteval-override",
        type=str,
        help="Path to an optional YAML or python Lighteval config to override part of the checkpoint Lighteval config",
    )
    parser.add_argument(
        "--cache-dir",
        type=str,
        default="",
        help="Cache directory",
    )

    return parser


if __name__ == "__main__":
    parser = get_parser()
    args, unknowns = parser.parse_known_args()
    main(args.checkpoint_config_path, args.lighteval_override, args.cache_dir)
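Similarly, a sketch of how these arguments flow into `main()`. The paths are hypothetical, and `get_parser()` from the script above is assumed to be in scope.

```python
# Illustrative only: parse a hypothetical argument list for the nanotron entry point.
parser = get_parser()
args, unknowns = parser.parse_known_args(
    [
        "--checkpoint-config-path", "s3://my-bucket/checkpoints/5000/config.yaml",  # hypothetical path
        "--lighteval-override", "configs/lighteval_override.yaml",                  # hypothetical path
    ]
)
# argparse turns the dashed flags into underscored attributes, which main() receives positionally.
print(args.checkpoint_config_path, args.lighteval_override, args.cache_dir)
```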
@@ -198,6 +198,37 @@ def _sorting_criteria(self, request: GreedyUntilRequest | GreedyUntilWithLogitsR
        return -(len(toks) + gen_length)


class GenerativeTaskDatasetNanotron(DynamicBatchDataset):
    def __getitem__(self, index) -> tuple[int, Request]:
Review thread on __getitem__:

Member: Why do you need your own class? (Is it only to return the index with the item?)

Member (author): Nathan's requirement

Member: Yes, but I'm unsure why we need to grab the index for brr.
        """
        Get an item from the dataset depending on the split we are currently in.
        For instance, if we are in split 0, we will get the item at index 0, if
        we are in split 1, we will get the item at index self.split_size, etc.
        Used for dynamic batching.

        Args:
            index (int): The index of the item.

        Returns:
            tuple[int, Any]: The dataset index and the item at that index.
        """
        return index, self.sorted_data[index + self.split_start]

    def _sorting_criteria(self, request) -> int:
        """
        Sorting key used to build batches of similarly sized requests.

        Args:
            request (Any): The request to compute the key for.

        Returns:
            int: The negated total length (tokenized context plus generation size).
        """
        toks = request.tokenized_context
        gen_length = request.generation_size
        return -(len(toks) + gen_length)


class GenDistributedSampler(DistributedSampler):
    """A distributed sampler that copies the last element only when drop_last is False, so we keep a small padding in the batches
    as our samples are sorted by length.
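To illustrate the point raised in the review thread above, here is a small, self-contained sketch (an assumption about intent, not code from this PR) of why carrying the dataset index through __getitem__ is useful: after requests are sorted by length for batching, the index lets the caller write generations back in the original request order.

```python
# Toy example: strings stand in for tokenized requests, .upper() stands in for generation.
requests = ["aaaaa", "bb", "ccccccccc", "d"]
order = sorted(range(len(requests)), key=lambda i: -len(requests[i]))  # length-sorted, as in the dataset

results = [None] * len(requests)
for original_index in order:                 # iterate in batching (sorted) order
    item = requests[original_index]          # mirrors the (index, item) pair from __getitem__
    results[original_index] = item.upper()   # write back at the original position

assert results == [r.upper() for r in requests]  # original ordering is preserved
```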
@@ -18,13 +18,11 @@
     TaskConfigLogger,
     VersionsLogger,
 )
-from lighteval.utils import is_nanotron_available
+from lighteval.utils import is_nanotron_available, obj_to_markdown


 if is_nanotron_available():
-    from brrr.config import BrrrConfig
-    from brrr.experiment_loggers import obj_to_markdown
-    from nanotron.config import get_config_from_dict
+    from nanotron.config import Config, get_config_from_dict


 class EnhancedJSONEncoder(json.JSONEncoder):
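EnhancedJSONEncoder is what save() below passes to json.dumps so the dataclass-based logger objects can be serialized. As a rough sketch of the same idea (an assumption for illustration, not the encoder from this file; the class and dataclass names here are made up):

```python
import json
from dataclasses import asdict, dataclass, is_dataclass


class SketchJSONEncoder(json.JSONEncoder):
    """Fall back to dataclasses.asdict, then str, for objects json cannot serialize natively."""

    def default(self, o):
        if is_dataclass(o):
            return asdict(o)
        return str(o)


@dataclass
class DummyResult:
    task: str
    score: float


print(json.dumps({"results": DummyResult("mmlu", 0.42)}, cls=SketchJSONEncoder, indent=2))
```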
@@ -104,81 +102,81 @@ def save(
         """
         hlog("Saving experiment tracker")
-        try:
+        # try:
Member: If you remove the high level try/catch, please add other try/catches to prevent the other possible failures.

Member (author): Are we sure we want to silently catch mistakes, or should we not rather let the run fail?

Member: No, because we still want the results to be saved locally. That way we can upload them by hand instead of having to redo the whole eval.
With the outer wrapper commented out, the body of save() is dedented one level and now reads:

        date_id = datetime.now().isoformat().replace(":", "-")

        output_dir_results = Path(output_dir) / "results" / self.general_config_logger.model_name
        output_dir_details = Path(output_dir) / "details" / self.general_config_logger.model_name
        output_dir_details_sub_folder = output_dir_details / date_id
        output_dir_results.mkdir(parents=True, exist_ok=True)
        output_dir_details_sub_folder.mkdir(parents=True, exist_ok=True)

        output_results_file = output_dir_results / f"results_{date_id}.json"
        output_results_in_details_file = output_dir_details / f"results_{date_id}.json"

        hlog(f"Saving results to {output_results_file} and {output_results_in_details_file}")

        to_dump = {
            "config_general": asdict(self.general_config_logger),
            "results": self.metrics_logger.metric_aggregated,
            "versions": self.versions_logger.versions,
            "config_tasks": self.task_config_logger.tasks_configs,
            "summary_tasks": self.details_logger.compiled_details,
            "summary_general": asdict(self.details_logger.compiled_details_over_all_tasks),
        }
        dumped = json.dumps(to_dump, cls=EnhancedJSONEncoder, indent=2)

        with open(output_results_file, "w") as f:
            f.write(dumped)

        with open(output_results_in_details_file, "w") as f:
            f.write(dumped)

        for task_name, task_details in self.details_logger.details.items():
            output_file_details = output_dir_details_sub_folder / f"details_{task_name}_{date_id}.parquet"
            # Create a dataset from the dictionary
            try:
                dataset = Dataset.from_list([asdict(detail) for detail in task_details])
            except Exception:
                # We force cast to str to avoid formatting problems for nested objects
                dataset = Dataset.from_list(
                    [{k: str(v) for k, v in asdict(detail).items()} for detail in task_details]
                )

            # We don't keep 'id' around if it's there
            column_names = dataset.column_names
            if "id" in dataset.column_names:
                column_names = [t for t in dataset.column_names if t != "id"]

            # Sort column names to make it easier later
            dataset = dataset.select_columns(sorted(column_names))
            # Save the dataset to a Parquet file
            dataset.to_parquet(output_file_details.as_posix())

        if push_results_to_hub:
            self.api.upload_folder(
                repo_id=self.hub_results_repo if public else self.hub_private_results_repo,
                folder_path=output_dir_results,
                path_in_repo=self.general_config_logger.model_name,
                repo_type="dataset",
                commit_message=f"Updating model {self.general_config_logger.model_name}",
            )

        if push_details_to_hub:
            self.details_to_hub(
                model_name=self.general_config_logger.model_name,
                results_file_path=output_results_in_details_file,
                details_folder_path=output_dir_details_sub_folder,
                push_as_public=public,
            )

        if push_results_to_tensorboard:
            self.push_results_to_tensorboard(
                results=self.metrics_logger.metric_aggregated, details=self.details_logger.details
            )
        # except Exception as e:
        #     hlog("WARNING: Could not save results")
        #     hlog(repr(e))
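Following the review discussion above about keeping local results even when a push fails, a rough sketch (an assumption, not code from this PR) of how each push step could get its own guard. push_safely and the lambda stand-ins are hypothetical.

```python
def push_safely(step_name, fn, *args, **kwargs):
    """Run one push step; log and keep going on failure so the locally saved files survive."""
    try:
        fn(*args, **kwargs)
    except Exception as e:
        print(f"WARNING: {step_name} failed: {e!r}")  # the real tracker would use hlog here


# Hypothetical wiring; the callables below are stand-ins for the Hub upload calls in save().
push_safely("push_results_to_hub", lambda: print("uploading results folder ..."))
push_safely("push_details_to_hub", lambda: print("uploading details folder ..."))
```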
    def generate_final_dict(self) -> dict:
        """Aggregates and returns all the logger's experiment information in a dictionary.

@@ -487,7 +485,7 @@ def push_results_to_tensorboard(  # noqa: C901
         if not is_nanotron_available():
             hlog_warn("You cannot push results to tensorboard without having nanotron installed. Skipping")
             return
-        config: BrrrConfig = get_config_from_dict(self.general_config_logger.config, config_class=BrrrConfig)
+        config: Config = get_config_from_dict(self.general_config_logger.config, config_class=Config)
         lighteval_config = config.lighteval
         try:
             global_step = config.general.step