In [2]:
import json
# Load the seed data file. JSON text is UTF-8, so decode it explicitly rather
# than relying on the platform's default text encoding.
with open('/workspace/CS762_Project/Data_files/final_seed_data.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [None]:
# Run configuration for the fine-tuning cells below. Keys and their order are
# kept exactly as before; only the layout changed (one setting per line).
args = {
    # Distributed / checkpoint bookkeeping
    'local_rank': -1,
    'save_total_limit': 5,

    # Batch sizes and optimisation
    'per_device_train_batch_size': 4,
    'per_device_eval_batch_size': 1,
    'gradient_accumulation_steps': 4,
    'learning_rate': 0.0002,
    'max_grad_norm': 0.3,
    'weight_decay': 0.001,

    # LoRA adapter settings (consumed by LoraConfig)
    'lora_alpha': 16,
    'lora_dropout': 0.1,
    'lora_r': 64,

    # Sequence length, base model and dataset locations
    'max_seq_length': 512,
    'model_name': '/workspace/CS762_Project/CodeLlama-7b-Python-hf',
    'dataset_name': '/workspace/CS762_Project/generated_data',

    # bitsandbytes quantisation settings (consumed by BitsAndBytesConfig)
    'use_4bit': True,
    'use_nested_quant': False,
    'bnb_4bit_compute_dtype': 'float16',
    'bnb_4bit_quant_type': 'nf4',

    # Training loop settings
    'num_train_epochs': 1,
    'fp16': True,
    'bf16': False,
    'packing': False,
    'gradient_checkpointing': True,
    'optim': 'paged_adamw_32bit',
    'lr_scheduler_type': 'constant',
    'max_steps': 10000,
    'warmup_ratio': 0.03,
    'group_by_length': True,
    'save_steps': 10,
    'logging_steps': 10,
    'merge_and_push': False,
    'output_dir': './results',
}

In [13]:
import random
# Re-read the seed data so this cell always starts from the on-disk records,
# regardless of what an earlier cell did to `data`.
with open('/workspace/CS762_Project/Data_files/final_seed_data.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Every record gets a random cluster id in 0..NUM_CLUSTERS-1. A dedicated,
# seeded generator keeps the assignment reproducible across kernel restarts
# and leaves the global `random` state untouched.
NUM_CLUSTERS = 5
CLUSTER_SEED = 0
cluster_rng = random.Random(CLUSTER_SEED)

# Group the records by cluster id; keys appear in first-seen order.
clustered_data = {}
for example in data:
    example['cluster'] = cluster_rng.randint(0, NUM_CLUSTERS - 1)
    clustered_data.setdefault(example['cluster'], []).append(example)

In [14]:
# Number of records in each cluster, keyed by cluster id.
{cluster_id: len(members) for cluster_id, members in clustered_data.items()}

{0: 16, 4: 17, 1: 8, 2: 9, 3: 8}

In [None]:
def create_and_prepare_model(args):
    """Load the quantized base model, build the LoRA config and the tokenizer.

    Args:
        args: Run configuration, given either as an object exposing the
            settings as attributes or as a plain dict with the same keys.
            The settings read here are ``model_name``, ``use_4bit``,
            ``use_nested_quant``, ``bnb_4bit_compute_dtype``,
            ``bnb_4bit_quant_type``, ``lora_alpha``, ``lora_dropout`` and
            ``lora_r``.

    Returns:
        tuple: ``(model, peft_config, tokenizer)``.
    """
    # Accept a plain dict of settings as well as an attribute-style object.
    if isinstance(args, dict):
        from types import SimpleNamespace

        args = SimpleNamespace(**args)

    # The dtype is configured by name (e.g. 'float16') and resolved on torch.
    compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=args.use_4bit,
        bnb_4bit_quant_type=args.bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=args.use_nested_quant,
    )

    # Purely informational hint; it does not change any setting.
    if compute_dtype == torch.float16 and args.use_4bit:
        major, _ = torch.cuda.get_device_capability()
        if major >= 8:
            print("=" * 80)
            print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
            print("=" * 80)

    # Load the entire model on the GPU 0
    # switch to `device_map = "auto"` for multi-GPU
    device_map = {"": 0}

    model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        quantization_config=bnb_config,
        device_map=device_map,
        use_auth_token=True,
    )

    # check: https://github.com/huggingface/transformers/pull/24906
    model.config.pretraining_tp = 1

    # All settings come from the `args` parameter; the previous code read a
    # `script_args` name that is not defined in this function.
    peft_config = LoraConfig(
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        r=args.lora_r,
        bias="none",
        task_type="CAUSAL_LM",
    )

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True)
    # Reuse the EOS token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    return model, peft_config, tokenizer

def train_model(script_args):
    """Fine-tune the model described by ``script_args`` with ``SFTTrainer``.

    Args:
        script_args: Run configuration, given either as an object exposing
            the settings as attributes or as a plain dict with the same keys.
            Besides the settings consumed by ``create_and_prepare_model``,
            this reads ``output_dir``, the batch-size / optimiser / schedule
            settings forwarded to ``TrainingArguments``, ``dataset_name``,
            ``max_seq_length`` and ``packing``.

    Returns:
        The ``SFTTrainer`` after ``train()`` has finished, so the caller can
        save or inspect the result. (Previously nothing was returned.)
    """
    # Accept a plain dict of settings as well as an attribute-style object.
    if isinstance(script_args, dict):
        from types import SimpleNamespace

        script_args = SimpleNamespace(**script_args)

    # `max_steps` is deliberately not forwarded: run length is controlled by
    # `num_train_epochs`. Evaluation and checkpointing share one step interval.
    training_arguments = TrainingArguments(
        output_dir=script_args.output_dir,
        per_device_train_batch_size=script_args.per_device_train_batch_size,
        per_device_eval_batch_size=script_args.per_device_eval_batch_size,
        gradient_accumulation_steps=script_args.gradient_accumulation_steps,
        evaluation_strategy='steps',
        eval_steps=script_args.save_steps,
        optim=script_args.optim,
        save_steps=script_args.save_steps,
        logging_steps=script_args.logging_steps,
        learning_rate=script_args.learning_rate,
        fp16=script_args.fp16,
        bf16=script_args.bf16,
        max_grad_norm=script_args.max_grad_norm,
        warmup_ratio=script_args.warmup_ratio,
        group_by_length=script_args.group_by_length,
        lr_scheduler_type=script_args.lr_scheduler_type,
        num_train_epochs=script_args.num_train_epochs,
        save_total_limit=script_args.save_total_limit,
        metric_for_best_model='eval_loss',
        save_strategy='steps',
    )

    model, peft_config, tokenizer = create_and_prepare_model(script_args)
    model.config.use_cache = False

    # The dataset is read from disk and must provide 'train' and 'test' splits.
    full_dataset = DatasetDict.load_from_disk(script_args.dataset_name)

    # Fix weird overflow issue with fp16 training
    tokenizer.padding_side = "right"

    trainer = SFTTrainer(
        model=model,
        train_dataset=full_dataset['train'],
        eval_dataset=full_dataset['test'],
        peft_config=peft_config,
        dataset_text_field="prompt",
        max_seq_length=script_args.max_seq_length,
        tokenizer=tokenizer,
        args=training_arguments,
        packing=script_args.packing,
    )

    trainer.train()
    return trainer

In [None]:
model_list = ['/workspace/CS762_Project/phi-2']
k_list = [1, 5, 10]
for model in model_list:
    for k in k_list:
        # load data_1, data_5, data_10
        