# Training a Causal Language Model on external sensitive data using the LoRA technique

## (will be done autonomously in order to avoid data leakage)
## LLM = BigScience Bloom-3b (*open-source*)

### JSON Dataset preprocessing

In [None]:
def preprocess_text(text: str) -> str:
    """Flatten ``text`` onto a single line.

    Every newline character is replaced by exactly one space; nothing else
    is altered (consecutive newlines therefore become consecutive spaces).

    Args:
        text (str): Raw text, possibly spanning several lines.

    Returns:
        str: The same text with each newline turned into a space.
    """
    return " ".join(text.split("\n"))

In [None]:
def preprocess_data(dataset_path: Path, min_length: int, tokenizer: PreTrainedTokenizer) -> str:
    """Build one training string from a JSON-lines file.

    Each non-blank line of the file is parsed as a JSON object and the first
    value of that object is taken as the text of one page. Pages whose text is
    strictly longer than ``min_length`` characters are concatenated in file
    order. In the concatenated text every period directly followed by a
    newline is rewritten as a period followed by the tokenizer's EOS token,
    and the remaining newlines are turned into spaces by ``preprocess_text``.

    Args:
        dataset_path (Path): Path to the jsonl file holding the extracted text.
        min_length (int): Pages with ``len(text) <= min_length`` are dropped
            (used to filter out pages without text).
        tokenizer (PreTrainedTokenizer): HuggingFace tokenizer; only its
            ``eos_token`` attribute is read here.

    Returns:
        str: The concatenated, cleaned text of all retained pages.
    """
    pages = []
    # JSON text is UTF-8 by specification, so do not rely on the platform default.
    with open(dataset_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            elt = json.loads(line)
            text: str = list(elt.values())[0]
            if len(text) > min_length:
                pages.append(text)
    grouped_text = "".join(pages)
    # A period followed by a newline marks the end of a paragraph: replace the
    # newline with the EOS token so paragraph boundaries survive the flattening.
    grouped_text = grouped_text.replace(".\n", "." + tokenizer.eos_token)
    return preprocess_text(grouped_text)

In [None]:
def tokenize(element: Mapping, tokenizer: Callable,
             context_length: int) -> dict:
    """Tokenize a batch of texts into chunks of exactly ``context_length`` tokens.

    ``tokenizer`` is called on ``element['text']`` with truncation, overflowing
    tokens and lengths enabled and ``max_length=context_length``. Only the
    chunks whose reported length equals ``context_length`` are kept; shorter
    chunks are discarded.

    Args:
        element (Mapping): Batch with a ``'text'`` entry passed to the tokenizer.
        tokenizer (Callable): Callable returning a mapping with ``'length'``
            and ``'input_ids'`` sequences of equal size.
        context_length (int): Required number of tokens per retained chunk.

    Returns:
        dict: ``{"input_ids": [...]}`` holding the retained chunks in order.
    """
    inputs = tokenizer(element['text'], truncation=True, return_overflowing_tokens=True,
                       return_length=True, max_length=context_length)
    # Drop every chunk that is shorter than the context length.
    full_chunks = [
        input_ids
        for length, input_ids in zip(inputs['length'], inputs['input_ids'])
        if length == context_length
    ]
    return {"input_ids": full_chunks}

In [None]:
def prepare_dataset(dataset_path: Path, min_length: int, context_length: int,
                    test_size: float, shuffle: bool, hf_repo: str,
                    model_name: str = "bigscience/bloom-3b") -> None:
    """Prepare dataset for training and push it to the hub.

    The jsonl file is turned into one long string by ``preprocess_data``,
    split into chunks of ``context_length`` tokens by ``tokenize``, divided
    into train and test splits, and the resulting splits are pushed to
    ``hf_repo``.

    Args:
        dataset_path (Path): Path to the jsonl file with the raw text.
        min_length (int): Forwarded to ``preprocess_data`` to filter short pages.
        context_length (int): Number of tokens in every dataset row.
        test_size (float): Forwarded to ``train_test_split``.
        shuffle (bool): Forwarded to ``train_test_split``.
        hf_repo (str): Hub repository the split dataset is pushed to.
        model_name (str): Checkpoint whose tokenizer is loaded. Defaults to
            ``"bigscience/bloom-3b"``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    LOGGER.info(f'Start preparing dataset from {dataset_path}')
    text = preprocess_data(dataset_path=dataset_path, min_length=min_length, tokenizer=tokenizer)
    # One-row dataset: the whole corpus is a single string that `tokenize`
    # expands into many fixed-size rows during the batched map.
    dataset = Dataset.from_dict({'text': [text]})
    tokenized_dataset = dataset.map(
        tokenize,
        batched=True,
        fn_kwargs={'tokenizer': tokenizer, 'context_length': context_length},
        remove_columns=dataset.column_names,
    )
    LOGGER.info(f'The tokenized dataset is composed of {tokenized_dataset.num_rows} elements, each one composed of {context_length} tokens.')
    tokenized_dataset_dict = tokenized_dataset.train_test_split(test_size=test_size, shuffle=shuffle)
    LOGGER.info(f'The training dataset is composed of {tokenized_dataset_dict["train"].num_rows} elements, the test dataset is composed of {tokenized_dataset_dict["test"].num_rows} elements.')
    tokenized_dataset_dict.push_to_hub(hf_repo)
    LOGGER.info('Preparing dataset finished.')

### Train Causal language model

In [None]:
import bitsandbytes
import accelerate
from transformers import AutoModelForCausalLM

# Load the pretrained BLOOM-3b checkpoint as a causal language model.
# NOTE(review): `load_in_8bit=True` and `device_map="auto"` presumably rely on
# the `bitsandbytes` and `accelerate` packages imported above — confirm against
# the installed transformers version.
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-3b", 
                  device_map="auto", load_in_8bit=True)

In [None]:
# Disable gradients on every parameter of the loaded model so that none of
# its existing weights are updated during training.
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later

In [None]:
import torch

# Visit every parameter of the model: a bare `if` here would only inspect
# whatever single `param` happened to be left bound by an earlier loop.
for param in model.parameters():
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)

In [None]:
class CastOutputToFloat(torch.nn.Sequential):
    """Sequential container whose output is always converted to float32.

    The wrapped modules run unchanged; only the final result is cast.
    """

    def forward(self, x):
        """Run the wrapped modules on ``x`` and return the result as float32."""
        return super().forward(x).to(torch.float32)
# Wrap the existing output head so that whatever it produces is cast by
# CastOutputToFloat before being returned.
model.lm_head = CastOutputToFloat(model.lm_head)

In [None]:
from peft import LoraConfig, get_peft_model 

# LoRA adapter settings; see the peft `LoraConfig` documentation for the
# exact meaning of each field.
lora_config = LoraConfig(
    r=16,  # presumably the rank of the low-rank update matrices — confirm in peft docs
    lora_alpha=32,  # presumably the LoRA scaling factor — confirm in peft docs
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # set this for CLM or Seq2Seq
)

# Rebind `model` to the adapter-wrapped model returned by peft.
model = get_peft_model(model, lora_config)

In [None]:
def print_trainable_parameters(model):
    """Summarise how many of ``model``'s parameters are trainable.

    Despite its name the function prints nothing: it returns the summary.

    Args:
        model: Object exposing ``named_parameters()``; each parameter must
            provide ``numel()`` and ``requires_grad``.

    Returns:
        str: Trainable element count, total element count and the trainable
        percentage, formatted on one line.
    """
    # (element count, is trainable) for every parameter, gathered in one pass.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, trainable in sizes if trainable)
    return f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"



In [None]:
# The function returns its summary string instead of printing it; as the last
# expression of the cell, the value is what the notebook displays.
print_trainable_parameters(model)

In [None]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

# Hub repository holding the tokenized dataset. It is intentionally left blank
# here and must be set to a real repository id before running this cell.
hf_repo = ""
dataset = load_dataset(hf_repo)

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")
trainer = Trainer(
    model=model,
    train_dataset=dataset['train'],
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        warmup_steps=100,
        weight_decay=0.1,
        num_train_epochs=3,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
    ),
    # mlm=False selects the causal (next-token) objective instead of masked
    # language modelling, which is what a causal LM must be trained with.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [None]:
# Turn the model's `use_cache` config flag off before training.
model.config.use_cache = False  # silence warnings
# Run the training loop configured on `trainer`.
trainer.train()

In [None]:
# Upload the trained model to the Hub.
# NOTE(review): `hf_repo` is the same variable used earlier to load the
# dataset, so model and dataset share one repository id — confirm this is
# intended.
model.push_to_hub(hf_repo)

### Inference using fine-tuned model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftConfig, PeftModel

# Import the model
# The adapter config stored in `hf_repo` names the base checkpoint it was
# trained on; that name is used to load both the base model and its tokenizer.
config = PeftConfig.from_pretrained(hf_repo)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# Load the Lora model
# `model` is rebound to the base model combined with the weights from `hf_repo`.
model = PeftModel.from_pretrained(model, hf_repo)

In [None]:
prompt = "The hobbits were so suprised seeing their friend"

# The model was loaded with device_map='auto', so its weights may not live on
# the CPU; move the tokenized prompt to the model's device before generating.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# Generate at most 100 new tokens, stopping early if the EOS token is produced.
tokens = model.generate(
    **inputs,
    max_new_tokens=100,
    temperature=1,
    eos_token_id=tokenizer.eos_token_id,
    early_stopping=True
)