In [1]:
from datasets import load_dataset, DatasetDict, concatenate_datasets, Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# based on config
# First of the four source corpora; only the train split is requested.
raw_datasets = load_dataset(
    "Karayel-DDI/Turkce-hendrycks_competition_math",
    split="train",
)

In [3]:
# Second source corpus (train split).
raw_datasets_1 = load_dataset(
    "Karayel-DDI/Turkce_Lighteval_MATH-Hard",
    split="train",
)

In [4]:
# Third source corpus (train split). Its instruction/response columns are
# renamed to problem/solution, the column names the later cells rely on.
raw_datasets_2 = load_dataset(
    "Karayel-DDI/Turkce-qwedsacf_grade-school-math-instructions",
    split="train",
)
raw_datasets_2 = raw_datasets_2.rename_columns(
    {"instruction": "problem", "response": "solution"}
)

In [None]:
# Fourth source corpus (train split): drop the 'Unnamed: 0' column, then
# rename instruction/response to problem/solution.
raw_datasets_3 = (
    load_dataset("Karayel-DDI/egitim_data", split="train")
    .remove_columns("Unnamed: 0")
    .rename_columns({"instruction": "problem", "response": "solution"})
)

In [5]:
# Concatenate the four source datasets into a single dataset, in the order listed.
combined_dataset = concatenate_datasets(
    [
        raw_datasets,
        raw_datasets_1,
        raw_datasets_2,
        raw_datasets_3,
    ]
)

In [7]:
import pandas as pd
# Convert the combined dataset to a pandas DataFrame so it can be sliced by
# row position below. `to_pandas()` converts the underlying table directly
# instead of iterating the dataset row by row in Python, which is what
# `pd.DataFrame(combined_dataset)` did.
df = combined_dataset.to_pandas()

In [9]:
# Training split: rows [0, 21400) of the combined frame, with problem/solution
# renamed to prompt/message.
# preserve_index=False keeps the pandas index out of the resulting dataset, so
# no `__index_level_0__` column is ever produced. The earlier
# remove_columns('__index_level_0__') failed whenever the slice carried a plain
# RangeIndex, because that column is not created in that case.
# NOTE(review): rows are taken in concatenation order; no shuffle is visible in
# this notebook — confirm that an unshuffled split is intended.
train_ds = Dataset.from_pandas(df[0:21400], preserve_index=False).rename_columns(
    {"problem": "prompt", "solution": "message"}
)

In [10]:
# Held-out split: every row from position 21400 onward, with problem/solution
# renamed to prompt/message.
# preserve_index=False keeps the pandas index out of the resulting dataset, so
# no `__index_level_0__` column is ever produced. The earlier
# remove_columns('__index_level_0__') failed whenever the slice carried a plain
# RangeIndex, because that column is not created in that case.
# NOTE(review): these are the trailing rows of the concatenation, so they come
# from whichever source was appended last — confirm this is the intended test set.
test_ds = Dataset.from_pandas(df[21400:], preserve_index=False).rename_columns(
    {"problem": "prompt", "solution": "message"}
)

In [11]:
# Plain mapping of split name -> dataset, wrapped in a DatasetDict in the next cell.
dataset_dict = dict(train=train_ds, test=test_ds)

In [12]:
# Rebinds `raw_datasets` (previously the first source corpus) to the
# train/test DatasetDict used from here on.
raw_datasets = DatasetDict(**dataset_dict)

In [13]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer (and chat template) is used to format the data.
# NOTE(review): `model_id` is reassigned to an absolute path in a later cell
# before the trainer is built — confirm both point at the same checkpoint.
model_id = "Turkish-Llama-8b-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [14]:
# Display the id of the tokenizer's end-of-sequence token.
tokenizer.eos_token_id

128001

In [15]:
# Ids of the two stop tokens: the tokenizer's EOS token and "<|eot_id|>".
# Not referenced by the other cells shown in this notebook.
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

In [16]:
def format_chat_template(row):
    """Attach a chat-formatted ``text`` field to a single dataset row.

    A three-turn conversation (fixed system prompt, the row's ``prompt`` as
    the user turn, the row's ``message`` as the assistant turn) is rendered
    to a string with the notebook-level ``tokenizer``'s chat template
    (``tokenize=False``) and stored under ``row["text"]``.

    Args:
        row: Mapping with at least the keys ``"prompt"`` and ``"message"``.

    Returns:
        The same ``row`` object, modified in place with the new ``"text"`` key.
    """
    system_prompt = "Sen bir yapay zeka asistanısın. Kullanıcı sana bir görev verecek. Amacın görevi olabildiğince sadık bir şekilde tamamlamak. Görevi yerine getirirken adım adım düşün ve adımlarını gerekçelendir."
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": row["prompt"]},
        {"role": "assistant", "content": row["message"]},
    ]
    row["text"] = tokenizer.apply_chat_template(conversation, tokenize=False)
    return row

In [17]:
# Add the chat-formatted `text` column to every split, using 4 worker processes.
raw_datasets = raw_datasets.map(format_chat_template, num_proc=4)

Map (num_proc=4): 100%|████████████████████████████████████████| 21400/21400 [00:00<00:00, 26863.49 examples/s]
Map (num_proc=4): 100%|██████████████████████████████████████████| 3777/3777 [00:00<00:00, 18276.52 examples/s]


In [18]:
# The rendered `text` column now carries everything; drop its two inputs.
raw_datasets = raw_datasets.remove_columns(
    ["prompt", "message"]
)

In [19]:
# Display the DatasetDict to check the remaining features and row counts per split.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 21400
    })
    test: Dataset({
        features: ['text'],
        num_rows: 3777
    })
})

In [20]:
# Use the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# A reported maximum length above 100k is replaced by 2048; this value is
# later passed to the trainer as max_seq_length.
if tokenizer.model_max_length > 100_000:
    tokenizer.model_max_length = 2048

In [21]:
# Pull the two splits out of the DatasetDict for the trainer.
train_dataset, eval_dataset = raw_datasets["train"], raw_datasets["test"]

In [22]:
from transformers import BitsAndBytesConfig
import torch

# 4-bit NF4 quantization with double quantization and a bfloat16 compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

device_map = "auto"

# Keyword arguments handed to the trainer as `model_init_kwargs`.
model_kwargs = {
    "torch_dtype": "auto",
    "use_cache": False,  # set to False as we're going to use gradient checkpointing
    "device_map": device_map,
    "quantization_config": quantization_config,
}

In [23]:
from trl import SFTTrainer
from peft import LoraConfig
from transformers import TrainingArguments

In [24]:
# Name chosen for the fine-tuned model; not referenced by the other cells shown.
new_model = "llama-3-8b-ddi"

In [25]:
# Rebinds `model_id` (first set when the tokenizer was loaded) to an absolute
# path; this is the value the trainer receives as `model`.
# NOTE(review): confirm this path and the tokenizer's checkpoint are the same model.
model_id = "/karayel/Turkish-Llama-8b-Instruct-v0.1"

In [26]:
# path where the Trainer will save its checkpoints and logs
trained_model_id = "Llama-3-8B-qlora"
output_dir = 'llama_output'

training_args = TrainingArguments(
    # --- output and checkpointing ---
    output_dir=output_dir,
    overwrite_output_dir=True,
    save_strategy="steps",
    save_steps=400,
    save_total_limit=None,
    # --- precision: both mixed-precision flags are off ---
    fp16=False,
    bf16=False,
    # --- evaluation: once per epoch ---
    # `eval_strategy` replaces the deprecated `evaluation_strategy` argument.
    do_eval=True,
    eval_strategy="epoch",
    per_device_eval_batch_size=1,  # originally set to 8
    # --- optimization ---
    per_device_train_batch_size=1,  # originally set to 8
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    learning_rate=2.0e-05,
    lr_scheduler_type="cosine",
    num_train_epochs=1,
    max_steps=-1,
    # --- logging ---
    log_level="info",
    logging_strategy="steps",
    logging_steps=5,
    # hub_strategy="every_save",
    # report_to="tensorboard",
    report_to="none",
    seed=42,
)



In [27]:
# based on config
# LoRA adapter settings: rank 64, alpha 16, dropout 0.1, no bias terms,
# applied to the four attention projection modules.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [28]:
# Release cached GPU memory before the trainer (and with it the model) is built.
# When re-running this section, delete any previous `trainer` first so its
# memory can actually be freed.
torch.cuda.empty_cache()

In [29]:
# Build the supervised fine-tuning trainer. `model` is given as a path string,
# so the model is loaded using `model_init_kwargs` and wrapped with the LoRA
# configuration in `peft_config`. Examples are read from the `text` column and
# packed up to the tokenizer's maximum length.
# NOTE(review): the saved output of this cell warns that these arguments are
# deprecated on SFTTrainer in favour of SFTConfig.
trainer = SFTTrainer(
    model=model_id,
    model_init_kwargs=model_kwargs,
    peft_config=peft_config,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=tokenizer.model_max_length,
    packing=True,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Loading checkpoint shards: 100%|█████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.03it/s]


In [30]:
# Run fine-tuning; the value returned by `train()` is kept in `train_result`.
# NOTE: the saved output of this cell ends with a KeyboardInterrupt, i.e. the
# recorded run was stopped manually.
train_result = trainer.train()

***** Running training *****
  Num examples = 3,926
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3,926
  Number of trainable parameters = 54,525,952
  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


Saving model checkpoint to llama_output/checkpoint-400
loading configuration file /home/camp5/karayel/Turkish-Llama-8b-Instruct-v0.1/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128009,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.0",
  "use_cache": true,
  "vocab_size": 128256
}

tokenizer config file saved in llama_output/checkpoint-400/tokenizer_config.json
Special tokens file saved in llama_output/checkpoint-400/special_tokens_map.js

KeyboardInterrupt: 

In [None]:
# Base checkpoint the LoRA adapters are loaded onto. An empty string is not a
# usable path for `from_pretrained`, so this points at the base model the
# adapters in `llama_output` were trained from.
# NOTE(review): adjust if the base model lives elsewhere on this machine.
local_model_path = "/karayel/Turkish-Llama-8b-Instruct-v0.1"

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [5]:
# 4-bit NF4 quantization with double quantization and a bfloat16 compute dtype
# (same settings as the training-time quantization config).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [6]:
# Load the base model and its tokenizer from `local_model_path`.
# NOTE(review): the adapters are later merged into this 4-bit quantized model;
# confirm that merging into quantized weights (rather than an unquantized
# bfloat16 base) is intended.
model = AutoModelForCausalLM.from_pretrained(
    local_model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # loads automatically to gpu if there is one.
    quantization_config=bnb_config,  # COMMENT this in for quantization in 4bit (nf4)!
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    local_model_path,
    use_fast=True,
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
# Trainer checkpoint directory whose LoRA adapter weights are loaded onto the base model.
new_hub_model_adapters_path = "/karayel/llama_output/checkpoint-3200"

In [8]:
# # load peft model with the chosen adapter
from peft import PeftModel

# Wrap the base model with the adapter weights stored in the chosen checkpoint.
model_to_push = PeftModel.from_pretrained(model, new_hub_model_adapters_path)

In [9]:
# Fold the adapter weights into the base model and rebind `model_to_push` to
# the merged result.
# NOTE(review): this cell rebinds its own input, so it is meant to be run once
# per kernel session — re-create `model_to_push` before running it again.
model_to_push = model_to_push.merge_and_unload()



In [23]:
# Directory the merged model is written to. `new_hub_model_path` was not
# defined anywhere in the notebook, so this cell failed with a NameError on a
# fresh kernel.
# NOTE(review): the directory name below is a placeholder — set it to the
# desired output location.
new_hub_model_path = "llama-3-8b-ddi"
model_to_push.save_pretrained(new_hub_model_path)