# gemma-2b-it finetuning

### fine-tuning started!

In [1]:
import torch
import pandas as pd
import numpy as np

from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer
#from transformers import Trainer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
from datasets import load_dataset

In [3]:
ds = load_dataset("iamtarun/python_code_instructions_18k_alpaca")

In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 18612
    })
})

In [5]:
access_token = "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"

In [6]:
model_name = "google/gemma-2b-it"
tokenizer_name = "google/gemma-2b-it"

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          token=access_token,
                                          truncation=True,
                                          padding=True,
                                          max_length=200,
                                         )

In [8]:
model = AutoModelForCausalLM.from_pretrained(model_name,
                                            token=access_token,
                                            device_map="auto",
                                            torch_dtype=torch.float32)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
!nvidia-smi

Fri Apr 19 05:02:38 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  CUDA GPU                       On  | 00000000:E3:00.0 Off |                    0 |
| N/A   45C    P0              73W / 300W |  10024MiB / 81074MiB |      5%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [10]:
ds

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 18612
    })
})

In [11]:
def process(row):
    return tokenizer(row["instruction"],row["input"],row['output'],row["prompt"],truncation=True, padding=True, max_length=200)

In [12]:
import multiprocessing

In [13]:
ds = ds.map(process,
            num_proc=multiprocessing.cpu_count(),
            load_from_cache_file=True,
            batched=True
            )
train_dataset = ds["train"]

In [14]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [15]:
training_args = TrainingArguments(output_dir="./results",
                                 num_train_epochs=3,
                                 per_device_train_batch_size=4,
                                 per_device_eval_batch_size=4,
                                 weight_decay=0.01,
                                 logging_dir="./logs",
                                 logging_steps=300,
                                 warmup_steps=500,
                                 dataloader_num_workers=multiprocessing.cpu_count(),
                                 eval_accumulation_steps=1,
                                 gradient_accumulation_steps=2,
                                 optim="adamw_torch")

In [16]:
# model_trainer = Trainer(model,
#                        args=training_args,
#                         train_dataset = train_dataset,
#                        data_collator=data_collator,
#                        tokenizer = tokenizer)

In [17]:
# model_trainer.train()

In [18]:
!nvidia-smi

Fri Apr 19 05:02:39 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  CUDA GPU                       On  | 00000000:E3:00.0 Off |                    0 |
| N/A   45C    P0              74W / 300W |  10024MiB / 81074MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## model evaluation

![image.png](attachment:d1c15241-bca2-44ea-ba88-db0cadd4bdf2.png)

---

model training log chart
> 1. loss값은 감소추세에서 일정 선형을 이루어 내었음.
> 2. learning-rate(학습 곡선)의 선형 감소가 불안한 감이 없지않아 존재함.

In [19]:
! nvidia-smi

Fri Apr 19 05:02:40 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  CUDA GPU                       On  | 00000000:E3:00.0 Off |                    0 |
| N/A   45C    P0              74W / 300W |  10024MiB / 81074MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [20]:
# ! pip3 freeze | grep evaluate

In [21]:
from evaluate import Text2TextGenerationEvaluator as T2TGE

import numpy as np
import evaluate

In [22]:
acc_metric = evaluate.load("accuracy")

In [23]:
def compute_metric(eval_pred) :
    logit, labels = eval_pred
    predict = np.argmax(logit, axis=-1)
    return acc_metric.compute(predictions=predict, references=labels)

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
train_dataset, eval_dataset = train_test_split(train_dataset, test_size=.2, random_state=1401)

## Train again... for evaluating model's score and prediction

In [26]:
training_args_for_eval = TrainingArguments(output_dir="./eval_results_2",
                                 num_train_epochs=3,
                                 per_device_train_batch_size=4,
                                 per_device_eval_batch_size=4,
                                 weight_decay=0.01,
                                 logging_dir="./eval_logs_2",
                                 logging_steps=300,
                                 warmup_steps=500,
                                 dataloader_num_workers=0,
                                 eval_accumulation_steps=1,
                                 gradient_accumulation_steps=2,
                                 optim="adamw_torch",
                                evaluation_strategy="epoch")

In [29]:
model_trainer_for_eval = Trainer(model,
                       args=training_args_for_eval,
                        train_dataset = train_dataset_for_eval,
                        eval_dataset = eval_dataset,
                       data_collator=data_collator,
                       tokenizer = tokenizer,
                        compute_metrics=compute_metric)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [30]:
model_trainer_for_eval.train()

[34m[1mwandb[0m: Currently logged in as: [33mrlfdnjs9839[0m ([33mlineworld[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


AttributeError: 'list' object has no attribute 'keys'

In [28]:
ds = ds.map(process,
            num_proc=multiprocessing.cpu_count(),
            load_from_cache_file=True,
            batched=True
            )
train_dataset_for_eval = ds["train"]

In [None]:
eval_result = model_trainer_for_eval.evaluate()

In [None]:
model_trainer_for_eval.predict(eval_dataset["input"])

## dataset train_test_split at this time

In [32]:
import torch
import pandas as pd
import numpy as np
import multiprocessing

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import load_dataset