## GPT FINETUNING FOR DOMAIN SPECIFIC TASKS
* Special thanks to Iheb Chachane (Reacher) for contributing to this project by open-sourcing some of his work
* Fine-tuning Mistral and Zephyr models for domain-specific tasks


In [None]:
# Install/upgrade the libraries used for quantised LoRA fine-tuning in this notebook.
# NOTE(review): `datasets` and `sklearn` are imported in the next cell but are not
# installed here — presumably they are preinstalled in the target environment; confirm.
! pip install -U transformers accelerate peft bitsandbytes -q

In [None]:
import random
import os
import torch
import pandas as pd
import numpy as np 
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import  StratifiedKFold
from transformers import BitsAndBytesConfig, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import Dataset
from transformers import AutoTokenizer, LlamaForSequenceClassification
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType


In [None]:
class Cfg:
    """Static settings shared by every cell of the notebook."""

    # Run mode and reproducibility.
    seed = 42
    debug = False
    # 5 in debug mode, 50 otherwise; inference() uses it to name the
    # checkpoint directory it loads.
    steps = 50 if not debug else 5

    # Model selection: "mistral" picks Mistral-7B, any other value (e.g. "zephyr")
    # falls back to Zephyr-7B-alpha.
    model_type = "mistral" # zephyr
    model_name = {"mistral": "mistralai/Mistral-7B-v0.1"}.get(
        model_type, "HuggingFaceH4/zephyr-7b-alpha"
    )
    use_peft = True

    # Tokenisation limit and size of the classification head.
    max_len = 512
    num_labels = 7

def set_seed(seed=42):
    '''Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and switches cuDNN to deterministic mode so repeated runs give
    the same results. This is for REPRODUCIBILITY.

    Args:
        seed: Integer seed applied to every generator.
    '''
    # `random` is imported by the notebook, so it has to be seeded as well.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed. The running interpreter read this
    # variable at start-up, so this only affects processes launched later.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Seed the random number generators once, up front, with the configured seed.
set_seed(Cfg.seed)

### Data Loading
* The CSV files should have the columns `prompt`, `context`, `label` (or, alternatively, `question`, `context`, `label`)


In [None]:
# Read the train and test tables (replace the placeholder paths with real ones),
# then preview the first rows of each.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("Your path to train.csv", "Your path to test.csv")
)

display(train.head(), test.head())

In [None]:
def _build_input(df):
    """Return the model input text for every row of *df*.

    The text is ``"Prompt: <prompt>\\nContext: <context>"``. The prompt is read
    from the ``prompt`` column, or from ``question`` when ``prompt`` is absent.
    Missing cells are treated as empty strings so a single NaN does not turn
    the whole input into NaN.

    Args:
        df: DataFrame with a ``context`` column and a ``prompt`` or
            ``question`` column.

    Returns:
        A pandas Series of strings aligned with ``df.index``.
    """
    prompt_col = 'prompt' if 'prompt' in df.columns else 'question'
    prompt_text = df[prompt_col].fillna('').astype(str)
    context_text = df['context'].fillna('').astype(str)
    # The newline keeps the end of the prompt from running into "Context:".
    return "Prompt: " + prompt_text + "\nContext: " + context_text


train['input'] = _build_input(train)
test['input'] = _build_input(test)


### Cross Validation

In [None]:
# Stratified 5-fold split on the label. Shuffling with the project seed keeps
# the folds reproducible while making them independent of the row order in the CSV.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=Cfg.seed)
train['fold'] = -1
fold_col = train.columns.get_loc('fold')
for fold_id, (_, valid_pos) in enumerate(folds.split(train, train['label'])):
    # split() yields positional indices, so write by position (iloc); label-based
    # .loc is only equivalent when the index is the default 0..n-1 range.
    train.iloc[valid_pos, fold_col] = fold_id

### Build Tokenizer

In [None]:
# Load the tokenizer that matches the configured checkpoint; use_fast=False
# requests the slow (Python) tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(Cfg.model_name, use_fast = False)
# Reuse the end-of-sequence token for padding.
# NOTE(review): presumably the checkpoint's tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token


In [None]:
def preprocess_function(examples, max_length=Cfg.max_len):
    """Tokenize the ``input`` text of a batch of examples.

    Sequences are truncated to ``max_length`` tokens but are not padded here:
    the notebook's ``DataCollatorWithPadding(padding="longest")`` pads each
    batch to its own longest sequence, so padding every example to
    ``max_length`` up front only adds wasted computation.

    Args:
        examples: Batch mapping with an ``"input"`` entry holding the texts.
        max_length: Maximum number of tokens kept per example.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["input"], truncation=True, max_length=max_length)

In [None]:
# Batch collator shared by training and inference; padding="longest" pads each
# batch to the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding = "longest")

### Metric 

In [None]:
def compute_metrics(eval_pred):
    """Return ``{"accuracy": ...}`` for a ``(logits, labels)`` evaluation pair.

    The predicted class of each row is the arg-max over the last logits axis.
    """
    scores, references = eval_pred
    return {"accuracy": accuracy_score(references, np.argmax(scores, axis=-1))}

def softmax(x):
    """Row-wise softmax of a 2-D array.

    Each row's maximum is subtracted before exponentiating so large logits do
    not overflow; the result has the same shape as ``x``.
    """
    shifted = x - np.max(x, axis=1, keepdims=True)
    exponentials = np.exp(shifted)
    row_totals = exponentials.sum(axis=1, keepdims=True)
    return exponentials / row_totals

### Build Model

In [None]:
def print_trainable_parameters(model):
    """
    Prints the name of every trainable parameter, followed by one summary line
    with the trainable count, the total count and the trainable percentage.
    """
    total = 0
    trainable = 0
    for name, tensor in model.named_parameters():
        size = tensor.numel()
        total += size
        if not tensor.requires_grad:
            continue
        # Only parameters that receive gradients are listed and counted.
        print(name)
        trainable += size
    percentage = 100 * trainable / total
    print(
        f"trainable params: {trainable} || all params: {total} || trainable%: {percentage}"
    )

In [None]:
def train():
    for fold in range(5):
        print(f"Fold {fold} ----------------------------------- TRAINING -----------------------------------")
        train_df = train[train['fold'] != fold]
        valid_df = train[train['fold'] == fold]

        train_dataset = Dataset.from_pandas(train_df)
        valid_dataset = Dataset.from_pandas(valid_df)

        train_tokenized_df = train_dataset.map(preprocess_function, batched=True)
        valid_tokenized_df = valid_dataset.map(preprocess_function, batched=True)

        if Cfg.use_peft:
            peft_config = LoraConfig(
                r=4,
                lora_alpha=16,
                lora_dropout=0.1,
                bias="none",
                task_type=TaskType.SEQ_CLS,
                inference_mode=False,
                target_modules=[
                    "q_proj",
                    "v_proj"
                ],
                )
            
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.bfloat16
            )    

            base_model = LlamaForSequenceClassification.from_pretrained(
                Cfg.model_name,
                Cfg.num_labels,
                quantization_config = bnb_config,
                device_map = {"":0}
            )

            model = get_peft_model(base_model, peft_config)
            print_trainable_parameters(model)

        else:
            model = LlamaForSequenceClassification.from_pretrained(
                Cfg.model_name,
                Cfg.num_labels
            )
            print_trainable_parameters(model)

        training_args = TrainingArguments(
            output_dir=f"outputs/fold{fold}",
            learning_rate=1e-4,
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=8,
            max_grad_norm= 3,#0.3,
            optim='paged_adamw_32bit',
            lr_scheduler_type="cosine",
            num_train_epochs=3,
            weight_decay=0.0001,
            evaluation_strategy="steps",
            save_strategy="steps",
            save_steps = 100,
            eval_steps = 100,
            logging_steps= 100,
            load_best_model_at_end=True,
            push_to_hub=False,
            warmup_steps=10,
            report_to='none' # if DEBUG else 'wandb',
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_tokenized_df,
            eval_dataset=valid_tokenized_df,
            tokenizer=tokenizer,
            data_collator=data_collator,
            #compute_metrics=compute_metrics,
        )

        trainer.train()

        print(f"Fold {fold} ----------------------------------- VALIDATING -----------------------------------")
        valid_preds = trainer.predict(valid_tokenized_df)
        valid_preds = softmax(valid_preds.predictions)
        np.save(f"outputs/fold{fold}/valid_preds.npy", valid_preds)

    del trainer, model, base_model
                


### Inference

In [None]:
def inference():
    test_dataset = Dataset.from_pandas(test)
    test_tokenized_df = test_dataset.map(preprocess_function, batched=True)
    label_cols = [f'label_{i}' for i in range(Cfg.num_labels)]
    checkpoints = [f"outputs/fold{fold}/checkpoint-{Cfg.steps}" for fold in range(5)]
    test_preds = []

    for fold, checkpoint in enumerate(checkpoints):
        print(f"Fold {fold} ----------------------------------- TESTING -----------------------------------")
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        base_model = LlamaForSequenceClassification.from_pretrained(
            Cfg.model_name,
            Cfg.num_labels,
            quantization_config = bnb_config,
            device_map = {"":0}
        )

        base_model.config.pretraining_tp = 1 # 1 for 7B
        base_model.config.pad_token_id = tokenizer.pad_token_id

        model = PeftModel(base_model, get_peft_config(checkpoint))

        trainer = Trainer(
            model = model,
            tokenizer = tokenizer,
            data_collator = data_collator,
        )

        # valid 
        valid_df = train[train['fold'] == fold]
        idxs = valid_df.index
        valid_dataset = Dataset.from_pandas(valid_df)
        valid_tokenized_df = valid_dataset.map(preprocess_function, batched=True)

        valid_preds = trainer.predict(valid_tokenized_df)
        valid_preds = softmax(valid_preds.predictions)
        train.loc[idxs, label_cols] = valid_preds

        # test

        preds = trainer.predict(test_tokenized_df)
        preds = softmax(preds.predictions)
        test_preds.append(preds)

    del trainer, model, base_model

    test[label_cols] = np.mean(test_preds, axis=0)
    test[['id'] + label_cols].to_csv('experiment_1.csv', index=False)

    print("oof_logloss:", log_loss(train['label'], train[label_cols].values))



In [None]:
if __name__ == "__main__":
    train()
    inference()