In [1]:
!pip install -U transformers
!pip install -U datasets
!pip install -U accelerate
!pip install -U peft
!pip install -U trl
!pip install -U bitsandbytes
!pip install -U wandb

Collecting transformers
  Using cached transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.47.0-py3-none-any.whl (10.1 MB)
Using cached tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.3
    Uninstalling tokenizers-0.20.3:
      Successfully uninstalled tokenizers-0.20.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.46.3
    Uninstalling transformers-4.46.3:
      Successfully uninstalled transformers-4.46.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tr

In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [3]:
from huggingface_hub import login
from google.colab import userdata

hf_token = userdata.get('HugFace')

login(token = hf_token)

wb_token = userdata.get('wandb')

wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune Llama 3.2 3B Instruct on IELTS WT2',
    job_type="training",
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mharyanto[0m ([33mharyanto-universitas-pendidikan-indonesia[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
base_model = "meta-llama/Llama-3.2-3B-Instruct"
dataset_name = "chillies/IELTS-writing-task-2-evaluation"
new_model = "Llama-3.2-1B-chat-finetune-WT2"

In [5]:
torch_dtype = torch.float16
attn_implementation = "eager"

In [6]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.chat_template = None
model, tokenizer = setup_chat_format(model, tokenizer)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [8]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
model = get_peft_model(model, peft_config)

In [9]:
data_files = {"train": "train.csv", "test": "test.csv"}
dataset = load_dataset(dataset_name, data_files=data_files)

In [10]:
#Importing the dataset
# train_dataset = load_dataset(dataset_name, split="all")

def format_chat_template(row):
    row_json = [ {"role": "system", "content": row["prompt"]},
                 {"role": "user", "content": row["essay"]},
                 {"role": "assistant", "content": row["evaluation"]},
                 {"role": "assistant", "content": row["band"]}]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(
    format_chat_template,
    num_proc=torch.cuda.device_count()
)

dataset['train']['prompt'][3]

'Interviews form the basic selection criteria for most large companies. However, some poeple think that interviews is not a reliable mthod of choosing whom to employ and there are other better methods. To what extent do you agree or disagree?'

In [11]:
# dataset = dataset.train_test_split(test_size=0.1)

In [12]:
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    evaluation_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)



In [13]:
# Limit the training dataset to 2000 samples
train_dataset = dataset["train"].select(range(2000))

In [14]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/491 [00:00<?, ? examples/s]

In [15]:
trainer.train()



Step,Training Loss,Validation Loss
200,2.4747,2.24467
400,1.6689,2.202331
600,2.1105,2.189639
800,2.2453,2.171577
1000,1.3291,2.169861


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=1000, training_loss=2.088235293567181, metrics={'train_runtime': 2450.7905, 'train_samples_per_second': 0.816, 'train_steps_per_second': 0.408, 'total_flos': 1.7466625397090304e+16, 'train_loss': 2.088235293567181, 'epoch': 1.0})

In [16]:
wandb.finish()
model.config.use_cache = True

0,1
eval/loss,█▄▃▁▁
eval/runtime,█▇█▄▁
eval/samples_per_second,▁▂▁▅█
eval/steps_per_second,▁▂▁▅█
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇███
train/grad_norm,▂▃▂▂▅▂▁▂▄▃▂▂▃▅█▃▂▃▂▄▃▃▃▃▁▄▂▃▃▃▇▄▂▂▂▅▃▄▅▂
train/learning_rate,▇███▇▇▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁▁
train/loss,▆▆▄▃█▄▅▅▃▆▄▅▁▄▅▃▄▆▃▆▅▆▃▆▆▃▂▆▄▇▅▃▂▄▃▄▃▄▅▄

0,1
eval/loss,2.16986
eval/runtime,168.8698
eval/samples_per_second,2.908
eval/steps_per_second,2.908
total_flos,1.7466625397090304e+16
train/epoch,1.0
train/global_step,1000.0
train/grad_norm,0.8543
train/learning_rate,0.0
train/loss,1.3291


In [18]:
messages = [
    {
        "role": "system",
        "content": "Interview form the basic selection criteria for most large companies. However, some people think that interview is not a reliable method of choosing whom to employ and there are better methods. To what extent to you agree or disagree?",
        "role": "user",
        "content": "To agree or disagree with the statement that interviews are the basic filteration criteria is an important issue. Putting the discussion in a wider context, interviews has always been debatable. Even though some people think that there are better methods for employing a resource rather than interviews , I wholeheartedly believe that interview is a good method for recruiting candidates. First I will discuss some arguments supporting my ideas about this statement, after which some aspects against that will be presented. On the one hand, many people agree with this statement for many noteworthy reasons. The most remarkable is that the recruiters can get an idea about the personalitty and skills of the potential employees .For instance,when the person is asked about any topic and he answers it in a concise and crisp manner,then the recruiter gets to know he is suitable for the job. Another key reason is that if a candidate is asked about case studies then the recruiters can judge the personality traits of that employee and also the ability to think outside the box. On the other hand, other people disagree with this statement for many reasons. They believe that other modes of recruiting like written tests and group discussions will help understand the mindset in a better manner.Written tests help in evaluating the technical or theoretical knowlege of a person. Group discussions help in getting a grasp of the conversational skills that he/she possesses.For example,in sales and marketing jobs conversational skills play a major role. All in all, when all the specific reasons and relevant examples are considered and evaluated, I strongly agree with the idea supporting this statement because its benefits outweigh its drawbacks."
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False,
                                       add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=True,
                   truncation=True, max_length=512).to("cuda")


outputs = model.generate(**inputs, max_length=1000, num_return_sequences=1)#, num_beams=10, early_stopping=False, repetition_penalty=2.2)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])


## Task Achievement:
- The candidate has effectively addressed the given task by presenting a clear stance on the importance of interviews as a filtering criterion.
- The ideas presented are relevant and coherent, covering all aspects of the task.
- The arguments and evidence provided support the candidate's position.
- The candidate has fulfilled the requirements of the task by providing a balanced discussion of both sides of the argument.
- Suggested Band Score (Task Achievement): 8

## Coherence and Cohesion:
- The essay is well-organized, with clear transitions between sentences and paragraphs.
- Connecting words and phrases are used effectively to maintain a smooth progression of ideas.
- The logical sequence and arrangement of information are well-structured.
- Suggested Band Score (Coherence and Cohesion): 8

## Lexical Resource (Vocabulary):
- The candidate demonstrates a good range of vocabulary, using appropriate and varied words.
- There are some minor errors in vocabulary 

In [19]:
messages = [
    {
        "role": "system",
        "content": "The increase in the production of consumer goods results in damage to the natural environment. What are the causes of this? What can be done to solve this problem?",
        "role": "user",
        "content": "The production of new items and objects have been a well discussed topic recently in our society. Many people explain that the growth of world economy has brought harmful effect in the atmosphere. Nonetheless, others argue that it is necessary the creation of new goods as well as services. In this essay, I am going to analize the issue of new things, and at the end, I will give possible solutions of these problems. On one hand, undoubtedly, it is a reality that new goods have helped to increase the air pollution. For example: In the last years, the population has been increasing dramatically. Therefore, we need more products in order to conservate the life quality. As result, we demand more services and items such as clothes, cars, and food thus we produce more pollution along with garbage. Under these circumstances our environment was damaged for the increase of new products. In addition, many animals are dying out for the destruction of their natural ecosystem."
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False,
                                       add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=True,
                   truncation=True, max_length=512).to("cuda")


outputs = model.generate(**inputs, max_length=1000, num_return_sequences=1)#, num_beams=10, early_stopping=False, repetition_penalty=2.2)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])


**Task Achievement:**

- The candidate has partially addressed the task by discussing the impact of new production on the environment.
- The ideas presented are somewhat relevant to the task, but they lack clarity and coherence.
- Not all aspects of the task have been adequately covered, as the candidate has not provided a balanced perspective on the issue.
- The essay lacks a clear structure and logical flow of ideas.
- Suggested Band Score (Task Achievement): 5.5

**Coherence and Cohesion:**

- The essay lacks a clear and logical structure.
- Transitions between sentences and paragraphs are not smooth, and the overall organization of the text is weak.
- The use of connecting words and phrases is limited, making the essay difficult to follow.
- The essay lacks a cohesive and cohesive structure, making it difficult to understand the main ideas.
- Suggested Band Score (Coherence and Cohesion): 4.5

**Lexical Resource (Vocabulary):**

- The vocabulary used in the essay is limited and re

In [20]:
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)



README.md:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/1.67G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/123Harr/Llama-3.2-1B-chat-finetune-WT2/commit/4f7bb56e20504567347f8602b582929916e4bc31', commit_message='Upload model', commit_description='', oid='4f7bb56e20504567347f8602b582929916e4bc31', pr_url=None, repo_url=RepoUrl('https://huggingface.co/123Harr/Llama-3.2-1B-chat-finetune-WT2', endpoint='https://huggingface.co', repo_type='model', repo_id='123Harr/Llama-3.2-1B-chat-finetune-WT2'), pr_revision=None, pr_num=None)