<a href="https://colab.research.google.com/github/juliovicenzi/Benchmarks/blob/master/axur.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Axur selection process


In [25]:
!pip install transformers=='4.49.0' datasets=='3.4.1' peft=='0.14.0' mlflow=='2.21.0' evaluate=='0.4.0' bert_score=='0.3.13'



## 1. Data collection

In [68]:
from datasets import load_dataset, DatasetDict

RANDOM_SEED=42

def load_alpaca_dataset(
        num_samples: int = 6000,
        test_size: float = 0.5,
        seed: int = RANDOM_SEED
    ) -> DatasetDict:
    """Load a subset of the Alpaca dataset and split it into train and test.

    The first ``num_samples`` rows of the ``train`` split of
    ``tatsu-lab/alpaca`` are fetched from the Hugging Face hub, shuffled,
    and divided into a train and a test split.

    Args:
        num_samples: the number of rows to retrieve from the dataset
        test_size: the share of the retrieved rows assigned to the test
            split; forwarded unchanged to ``Dataset.train_test_split``
        seed: random seed to use during shuffling
    Returns:
        the downloaded dataset split into train and test
    """
    alpaca_data = load_dataset(
        "tatsu-lab/alpaca",
        # alpaca does not contain a test split by default, so we slice
        # the train split and create the test split ourselves below
        split=f"train[:{num_samples}]"
    )

    # Honour the caller-supplied split ratio and seed rather than
    # hard-coded values, so the parameters actually take effect.
    shuffled_split_data = alpaca_data.train_test_split(
        test_size=test_size,
        shuffle=True,
        seed=seed
    )
    return shuffled_split_data

data = load_alpaca_dataset()

In [4]:
data

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 3000
    })
})

Next, we will retrieve the GPT-2 small model from Hugging Face and test it on a sample prompt.

In [69]:
from transformers import AutoTokenizer, TrainingArguments, Trainer, GPT2LMHeadModel, GPT2Tokenizer

# Load the pretrained "gpt2" checkpoint together with its tokenizer.
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token for padding so that `padding=...` can be
# requested below (presumably the GPT-2 tokenizer ships without a pad token
# -- confirm against the tokenizer documentation).
tokenizer.pad_token = tokenizer.eos_token

# Smoke test: tokenize one question into PyTorch tensors, generate a
# continuation with the default generation settings, and decode it to text.
prompt = tokenizer("What is the capital of Germany?", return_tensors='pt', padding=True)
res = model.generate(**prompt)
tokenizer.batch_decode(res)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['What is the capital of Germany?\n\nThe capital of Germany is Berlin.\n\nThe capital of Germany is Berlin.\n\n']

In [27]:
from transformers import pipeline, set_seed
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
# Seed before generating -- presumably so the generated text is reproducible.
set_seed(RANDOM_SEED)
# Run the same question used in the manual generate() check.
generator("What is the capital of Germany?")

Device set to use cuda:0


TypeError: can only concatenate str (not "Dataset") to str

In [80]:
from functools import partial
import torch

def preprocess_data(sample: dict, tokenizer: GPT2Tokenizer) -> dict:
    """Turn one dataset row into tokenized prompt and response ids.

    Args:
        sample: a single row providing the "instruction", "input" and
            "output" fields
        tokenizer: tokenizer used to encode both the prompt and the response
    Returns:
        a dict with the keys "prompt" and "response", each holding the
        token ids padded or truncated to 128 tokens
    """
    # Assemble the prompt line by line. A triple-quoted literal would carry
    # the source indentation into the text the model sees.
    prompt = (
        f"Instruction: {sample['instruction']}\n"
        f"Input: {sample['input']}\n"
        "Response:\n"
    )
    tokenized_prompt = tokenizer(prompt, return_tensors='pt', padding="max_length", max_length=128, truncation=True)
    tokenized_response = tokenizer(sample["output"], return_tensors='pt', padding="max_length", max_length=128, truncation=True)
    # ensure all tensors are using the same dtype; token ids are vocabulary
    # indices, so they must stay integral (a float cast makes them unusable
    # as embedding indices)
    dtype = torch.long
    # NOTE(review): presumably each tensor has shape (1, 128) because a single
    # string is tokenized with return_tensors='pt' -- confirm before batching.
    return {"prompt" : tokenized_prompt["input_ids"].type(dtype),
            "response": tokenized_response["input_ids"].type(dtype)}

# Apply the same preprocessing to both splits, keeping only the tokenized
# columns: every original column of the split is dropped after mapping.
tokenized_train, tokenized_test = (
    data[split_name].map(
        partial(preprocess_data, tokenizer=tokenizer),
        remove_columns=data[split_name].column_names
    )
    for split_name in ("train", "test")
)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [26]:
from evaluate import load

# Load the metric modules by name through `evaluate.load`.
# NOTE(review): `precision` and `accuracy` are not used in the visible part
# of the notebook; only `bertscore` is referenced by evaluate_model.
bertscore = load("bertscore")
precision = load("precision")
accuracy = load("accuracy")


def evaluate_model(model, eval_data):
    """Compute BERTScore for model predictions (work in progress).

    Args:
        model: currently unused by the body
        eval_data: currently unused by the body
    Returns:
        None -- the result of ``bertscore.compute`` is discarded.
    """
    # NOTE(review): `predictions` is not defined in this function nor in any
    # visible cell, so calling this raises NameError. The references are a
    # single empty string; presumably they should come from `eval_data`.
    bertscore.compute(predictions=predictions, references=[""], lang="en")

In [16]:
from peft import LoraConfig, get_peft_model, TaskType
# LoRA adapter settings used to fine-tune the causal language model.
lora_config = LoraConfig(
    r=8, # Rank of the low-rank update matrices
    lora_alpha=32, # Scaling factor applied to the LoRA update
    lora_dropout=0.05, # Dropout rate for LoRA layers
    bias="none", # Bias type for LoRA layers
    task_type=TaskType.CAUSAL_LM # Task type
)
# Wrap the base model with the LoRA adapters described above.
# NOTE(review): get_peft_model is understood to modify `model` in place --
# confirm against the PEFT docs before reusing `model` as a baseline.
ft_model = get_peft_model(model, lora_config)



In [17]:
ft_model.print_trainable_parameters()

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364


In [None]:
def fine_tune_model(model, train_data, eval_data, epochs: int, learning_rate: float):
    """Placeholder for the fine-tuning routine; not implemented yet.

    Args:
        model: currently unused
        train_data: currently unused
        eval_data: currently unused
        epochs: currently unused
        learning_rate: currently unused
    Returns:
        None -- the function performs no work at the moment.
    """
    return None

# 3. How would you group questions from the train and test datasets that deal with similar subjects?

First, we can use sentence embeddings to get vector representations of the questions.

From the embeddings, we can group similar questions using *similarity search*, for example with [FAISS](https://ai.meta.com/tools/faiss/).

A more traditional approach could make use of K-means, or any other clustering algorithm, to group the questions based on their sentence embeddings.