<a href="https://colab.research.google.com/github/j0rdan0/AI-notebooks/blob/main/qwen2_7b_mC4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install huggingface_hub
#!pip install  BitsAndBytes
#!pip install peft # for LoRA
#!pip install evaluate

In [None]:
def hf_auth():
  """Log in to the Hugging Face Hub using the token stored in Colab user data.

  The token is read from the Colab secret named 'HF_TOKEN' and handed to
  huggingface_hub.login. Imports are local so the cell defining this
  function can run without either package being loaded.
  """
  from huggingface_hub import login
  from google.colab import userdata

  login(token=userdata.get('HF_TOKEN'))

In [None]:
# TODO: train a small LM on https://huggingface.co/datasets/uonlp/CulturaX for the Romanian (RO) language, using LoRA

model_name = "Qwen/Qwen2-7B-Instruct"

def generate_base_model(model_name):
  """Load the pretrained causal language model identified by model_name.

  Authenticates against the Hugging Face Hub first (via hf_auth), then
  loads the checkpoint with FlashAttention-2 and the dtype chosen by
  torch_dtype="auto".

  Args:
    model_name: Hub repository id or local path of the checkpoint.

  Returns:
    The model returned by AutoModelForCausalLM.from_pretrained.
  """
  from transformers import AutoModelForCausalLM

  hf_auth()

  # device_map="auto" already lets accelerate place the weights on the
  # available devices. The model must not be moved again with .to(device):
  # relocating a dispatched model is warned against and fails outright when
  # some modules were offloaded to CPU or disk.
  return AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype="auto",
  )

In [None]:
from peft import LoraConfig, TaskType,get_peft_model

def generate_peft_model(model):
  """Wrap model in a PEFT model configured with a LoRA adapter.

  The adapter is configured for the causal-LM task in training mode
  (inference_mode=False) with r=8, lora_alpha=32 and lora_dropout=0.1.

  Args:
    model: The base model to pass to get_peft_model.

  Returns:
    The object returned by get_peft_model.
  """
  lora_settings = dict(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
  )
  return get_peft_model(model, LoraConfig(**lora_settings))

In [None]:
def generate_tokenizer(model_name):
  """Load the tokenizer that belongs to model_name.

  Args:
    model_name: Hub repository id or local path of the checkpoint.

  Returns:
    The tokenizer from AutoTokenizer.from_pretrained, configured to pad
    on the left.
  """
  # Imported here because AutoTokenizer is not imported at module level in
  # the visible cells; without this the call raises NameError.
  from transformers import AutoTokenizer

  # The tokenizer must be returned: callers pass the result on to the
  # dataset pipeline, the data collator and the trainer.
  return AutoTokenizer.from_pretrained(model_name, padding_side="left")

In [None]:
def save_pretrained_model(model,model_name):
  """Merge the adapter into the model and save the result under model_name.

  Args:
    model: Object exposing merge_and_unload(); its return value must
      expose save_pretrained().
    model_name: Destination passed to save_pretrained.
  """
  model.merge_and_unload().save_pretrained(model_name)


In [None]:
def tokenize_dataset_rm(dataset,tokenizer,max_length=None):
  """Tokenize the "text" column of dataset and drop the original columns.

  Args:
    dataset: Object exposing map(fn, batched=..., remove_columns=...) and
      a column_names attribute.
    tokenizer: Callable applied to each batch of texts.
    max_length: Optional length used for truncation and 'max_length'
      padding. None (the default) leaves the choice to the tokenizer.

  Returns:
    Whatever dataset.map returns: the tokenized dataset without its
    original columns.
  """
  def _tokenize(batch):
    return tokenizer(
      batch["text"],
      truncation=True,
      padding='max_length',
      max_length=max_length,
    )

  # The columns to drop come from the dataset itself. The previous code read
  # them from the lambda's parameter outside the lambda, which raised
  # NameError before any tokenization happened.
  return dataset.map(
    _tokenize,
    batched=True,
    remove_columns=dataset.column_names,  # the raw columns are no longer needed
  )


In [None]:
# The Hugging Face library is published as `datasets` (plural); `dataset` is
# a different package and does not provide load_dataset.
from datasets import load_dataset

def process_dataset(tokenizer):
  """Stream the Romanian split of CulturaX and tokenize it.

  Args:
    tokenizer: Tokenizer forwarded to tokenize_dataset_rm.

  Returns:
    The tokenized dataset produced by tokenize_dataset_rm.
  """
  # streaming=True avoids downloading the whole corpus up front.
  dataset = load_dataset("uonlp/CulturaX", "ro", streaming=True, split="train")
  return tokenize_dataset_rm(dataset, tokenizer)

In [None]:
from transformers import DataCollatorForLanguageModeling

def generate_data_collator(tokenizer):
  """Build a language-modeling data collator with mlm=False.

  Side effect: the tokenizer's pad_token is overwritten with its eos_token
  before the collator is created.

  Args:
    tokenizer: Tokenizer to mutate and hand to the collator.

  Returns:
    A DataCollatorForLanguageModeling instance bound to tokenizer.
  """
  end_of_sequence = tokenizer.eos_token
  tokenizer.pad_token = end_of_sequence

  collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
  return collator

In [None]:
def generate_trainer(model,train_dataset,data_collator,tokenizer,eval_dataset=None,max_steps=-1):
  """Create a Trainer for fine-tuning model on train_dataset.

  Args:
    model: Model to train.
    train_dataset: Training dataset.
    data_collator: Collator used to assemble batches.
    tokenizer: Tokenizer handed to the Trainer.
    eval_dataset: Optional evaluation dataset. Evaluation runs once per
      epoch when it is given and is disabled otherwise.
    max_steps: Forwarded to TrainingArguments; -1 is that library's
      default. NOTE(review): a dataset without a length (for example a
      streaming one) is expected to need a positive value here — confirm
      against the installed transformers version.

  Returns:
    The configured Trainer.
  """
  # Imported here because neither name is imported at module level in the
  # visible cells; without this the function raises NameError.
  from transformers import Trainer, TrainingArguments

  training_args = TrainingArguments(
    output_dir="qwen2_7B_mC4_ro",
    # Evaluating every epoch only makes sense when there is something to
    # evaluate on.
    eval_strategy="epoch" if eval_dataset is not None else "no",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
    max_steps=max_steps,
  )
  # eval_dataset used to be read from an undefined global, lm_dataset["test"];
  # it is now supplied by the caller.
  return Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
  )


In [None]:
def push_model(trainer):
  """Upload the trainer's model by calling its push_to_hub method.

  Args:
    trainer: Object exposing push_to_hub(); its return value is discarded.
  """
  upload = trainer.push_to_hub
  upload()

In [None]:
model_name = "Qwen/Qwen2-7B-Instruct"

def main():
  """Run the fine-tuning pipeline end to end.

  Steps, in order: authenticate with the Hub, load the base model and wrap
  it with the LoRA adapter, load the tokenizer, build the tokenized
  dataset and the data collator, then create the trainer and train.
  """
  hf_auth()

  # Model: base checkpoint wrapped with the PEFT adapter.
  peft_model = generate_peft_model(generate_base_model(model_name))

  # Data: the tokenizer feeds both the dataset pipeline and the collator.
  tok = generate_tokenizer(model_name)
  train_data = process_dataset(tok)
  collator = generate_data_collator(tok)

  trainer = generate_trainer(peft_model, train_data, collator, tok)
  trainer.train()
  # Uploading is currently disabled:
  #push_model(trainer)
