# Run inference on MistrAND 7B

## Preparation

In [None]:
# Installations 
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U roman
!pip install -q -U torch
!pip install -q -U huggingface_hub
!pip install -q -U ipywidgets
!pip install -q -U git+https://github.com/huggingface/accelerate.git

# Load main model

### Imports

In [None]:
# Modelling
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import accelerate

# Conversor
from src.convert_to_andalusian_spanish import AndalusianConversor

# Hugging Face login
from huggingface_hub import notebook_login

### Login to Hugging Face

In [None]:
# Authenticate with the Hugging Face Hub from inside the notebook.
# Non-interactive alternative: !huggingface-cli login --token <YOUR_TOKEN>
notebook_login()

### Load model

First, load the base model (Mistral 7B) quantized to 4 bits, together with its tokenizer.

In [None]:
# Hub identifier of the base model the adapter is applied to.
base_model_id = "mistralai/Mistral-7B-v0.1"

# 4-bit NF4 quantization with double quantization; computations run in bfloat16.
# NOTE(review): presumably this should match the quantization config used when
# the adapter was trained — confirm against the training setup.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# `trust_remote_code` is deliberately not set: Mistral is implemented natively
# in transformers, so there is no need to allow code from the Hub repository
# to be executed locally.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",  # let accelerate decide where the weights are placed
)

# Tokenizer of the base model; a BOS token is prepended to every encoded prompt.
eval_tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True)

Then, load the QLoRA adapter from Hugging Face:

In [None]:
# Wrap the quantized base model with the fine-tuned adapter published on the Hub.
ft_model = PeftModel.from_pretrained(
    base_model,
    "jgchaparro/MistrAND-7B-v1",
)

# Run inference

In [8]:
# Input sentence that will be converted to Andalusian Spanish
text_to_convert = "¡Hola! Esto es una prueba"

In [10]:
# Run the input sentence through the Andalusian conversor and show the result;
# `converted_text` is what gets passed to the model afterwards.
conversor = AndalusianConversor()
converted_text = conversor.convert(text_to_convert)
print(converted_text)

¡Ola! Eьʌo eь una prueбa


In [None]:
# Run inference
def run_inference(text: str,
                  n_max_tokens: int = 500,
                  model=None,
                  tokenizer=None) -> str:
    """
    Runs inference on the MistrAND-7B-v1 model.

    Args:
        text (str): Andalusian Spanish text used as input for the model.
        n_max_tokens (int): Maximum number of new tokens to generate.
        model: Model used for generation. Defaults to the notebook-level
            ``ft_model``.
        tokenizer: Tokenizer used to encode the input and decode the output.
            Defaults to the notebook-level ``eval_tokenizer``.

    Returns:
        str: Decoded text of the first generated sequence, with special
            tokens removed.
    """
    # Resolve the notebook globals at call time so the function still works
    # exactly as before when called with only `text` (and `n_max_tokens`).
    if model is None:
        model = ft_model
    if tokenizer is None:
        tokenizer = eval_tokenizer

    # Move the inputs to wherever the model lives instead of a hard-coded
    # "cuda": this works on CPU-only machines and with `device_map="auto"`.
    model_input = tokenizer(text, return_tensors="pt").to(model.device)

    model.eval()
    with torch.no_grad():
        generated = model.generate(**model_input,
                                   max_new_tokens=n_max_tokens,
                                   repetition_penalty=1.15)

    # Only the first sequence of the batch is decoded.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
# Generate from the converted text and print what the model returns
output = run_inference(text=converted_text)
print(output)