# Tutorial on instruction tuning of LLaMA

In [None]:
# install dependencies

!pip install transformers --quiet
!pip install peft --quiet
!pip install datasets --quiet
!pip install deepspeed --quiet

In [None]:
import logging
import os
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

import datasets
import torch
import transformers
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    TrainingArguments,
    set_seed,
)

# Module-level logger; its level is set per process once the training arguments are parsed.
logger = logging.getLogger(__name__)

In [None]:
# Label value for positions excluded from the loss; -100 is the default
# `ignore_index` of torch.nn.CrossEntropyLoss.
# NOTE(review): not referenced in the visible cells -- presumably consumed by
# the dataset / collator code.
IGNORE_INDEX = -100

# Pad token added to the tokenizer when it does not define one (read by the
# tokenizer-loading cell).
# NOTE(review): "[PAD]" follows the Stanford Alpaca convention -- confirm it is
# the token this tutorial wants.
DEFAULT_PAD_TOKEN = "[PAD]"


In [None]:
@dataclass
class ModelArguments:
    """Command-line arguments selecting which model, config and tokenizer to load.

    The fields are exactly the attributes the config/tokenizer loading cell
    reads from ``model_args``.
    NOTE(review): the defaults mirror the Hugging Face example scripts --
    confirm they suit this tutorial.
    """

    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "Path or hub id of the pretrained model; the config is loaded "
            "from here when config_name is not set."
        },
    )
    config_name: Optional[str] = field(
        default=None,
        metadata={"help": "Config name or path; takes precedence over model_name_or_path."},
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={"help": "Tokenizer name or path passed to AutoTokenizer.from_pretrained."},
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Directory in which downloaded files are cached."},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Forwarded as `use_fast` when loading the tokenizer."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "Forwarded as `revision` (branch, tag or commit id) when loading."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={"help": "When set, `use_auth_token=True` is forwarded to the loaders."},
    )


In [None]:
@dataclass
class SftTrainingArguments(TrainingArguments):
    """Training arguments for supervised fine-tuning (SFT).

    Adds no fields to ``TrainingArguments`` yet; declare SFT-specific options
    here as dataclass fields.
    """

In [None]:
# Parse the command-line flags into the three argument dataclasses.
# NOTE(review): DataTrainingArguments is not defined in the visible cells; it
# must exist before this cell runs.
# NOTE(review): inside a Jupyter kernel sys.argv holds the kernel's own flags,
# so parsing may need an explicit `args=[...]` list -- confirm how this is run.
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, SftTrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()

# The root handler writes to stdout at INFO; the per-process verbosity is
# applied to this module's logger just below.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
    handlers=[logging.StreamHandler(sys.stdout)],
)
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)

# Emitted at WARNING so that it is shown on every process.
logger.warning(
    f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
    f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)

# Seed the random number generators before any model or data work.
set_seed(training_args.seed)



In [None]:
# Hub / cache options forwarded to AutoConfig.from_pretrained.
config_kwargs = {
    "cache_dir": model_args.cache_dir,
    "revision": model_args.model_revision,
    "use_auth_token": True if model_args.use_auth_token else None,
}

# An explicit config name wins; otherwise use the model checkpoint's config.
config_source = model_args.config_name or model_args.model_name_or_path
if not config_source:
    raise ValueError("config_name or model_name_or_path")
config = AutoConfig.from_pretrained(config_source, **config_kwargs)

# The tokenizer takes the same options plus the fast/slow switch.
tokenizer_kwargs = {
    **config_kwargs,
    "use_fast": model_args.use_fast_tokenizer,
}

# Same precedence as for the config: fall back to the model checkpoint when no
# tokenizer is named, and fail with a clear message when neither is set.
tokenizer_source = model_args.tokenizer_name or model_args.model_name_or_path
if not tokenizer_source:
    raise ValueError("Cannot load a tokenizer: set tokenizer_name or model_name_or_path")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, **tokenizer_kwargs)

if tokenizer.pad_token is None:
    # NOTE(review): adding a token grows the vocabulary; presumably the model's
    # embeddings are resized to match when the model is loaded -- confirm.
    print(f"Adding pad token {DEFAULT_PAD_TOKEN}")
    tokenizer.add_special_tokens(dict(pad_token=DEFAULT_PAD_TOKEN))


In [None]:
# Build the collator and, depending on do_train / do_eval, the tokenized datasets.
# NOTE(review): DataCollatorForSupervisedDataset and build_instruction_dataset
# are not defined in the visible cells; they must exist before this cell runs.
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
train_dataset = None
eval_dataset = None

if training_args.do_train:
    with training_args.main_process_first(desc="loading and tokenization"):
        path = Path(data_args.dataset_dir)
        # glob() already yields paths under `path`; its order is filesystem
        # dependent, so sort to give every run the same file order.
        files = sorted(str(json_file) for json_file in path.glob("*.json"))
        if not files:
            raise FileNotFoundError(f"No *.json training files found in {path}")
        logger.info(f"Training files: {' '.join(files)}")
        train_dataset = build_instruction_dataset(
            data_path=files,
            tokenizer=tokenizer,
            max_seq_length=data_args.max_seq_length,
            data_cache_dir=None,
            preprocessing_num_workers=data_args.preprocessing_num_workers,
        )
    logger.info(f"Num train_samples  {len(train_dataset)}")
    logger.info("training example:")
    logger.info(tokenizer.decode(train_dataset[0]['input_ids']))

if training_args.do_eval:
    # Fail early with a clear message instead of a TypeError from str.join below.
    if not data_args.validation_file:
        raise ValueError("do_eval requires validation_file to be set")
    with training_args.main_process_first(desc="loading and tokenization"):
        files = [data_args.validation_file]
        logger.info(f"Evaluation files: {' '.join(files)}")
        eval_dataset = build_instruction_dataset(
            data_path=files,
            tokenizer=tokenizer,
            max_seq_length=data_args.max_seq_length,
            data_cache_dir=None,
            preprocessing_num_workers=data_args.preprocessing_num_workers,
        )
    logger.info(f"Num eval_samples  {len(eval_dataset)}")
    logger.info("eval example:")
    logger.info(tokenizer.decode(eval_dataset[0]['input_ids']))