In [1]:
def get_intent_prompt(user_input):
    """Build the intent-classification prompt for a single user query.

    The prompt (written in Spanish) tells the model it classifies intents for
    people with hearing disabilities, lists the allowed category labels one per
    line, embeds the query between double quotes, and asks for the bare
    category name as the only output.

    Args:
        user_input: The user's query; it is interpolated as-is into the prompt.

    Returns:
        The complete prompt string, starting and ending with a newline.
    """
    labels = (
        "HEARING_AIDS",
        "VISUAL_SIGNALS",
        "AUDIO_TRANSLATION",
        "GENERATE_IMAGE",
        "MEDICAL_CENTER",
        "RECOMMEND_APP",
        "KNOW_RIGHTS",
        "CERTIFICATE",
        "SOUND_REPORT",
        "GENERAL_QUERY",
    )
    # One "- LABEL" bullet per category, in the fixed order above.
    bullet_list = "\n".join("- " + label for label in labels)

    pieces = [
        "\n",
        "Eres un asistente que clasifica intenciones para ayudar a personas con discapacidad auditiva.\n",
        "\n",
        "Clasifica la siguiente consulta del usuario en una de estas categorías:\n",
        bullet_list + "\n",
        "\n",
        f'Consulta del usuario: "{user_input}"\n',
        "\n",
        "Responde SOLO con una de las categorías.\n",
        "No respondas nada más, solo la categoría exacta sin explicaciones.\n",
    ]
    return "".join(pieces)


In [3]:
import google.generativeai as genai
import os

import dotenv

# Let python-dotenv populate the environment before the key is read
dotenv.load_dotenv()

# The Gemini API key comes from the environment rather than being hard-coded,
# e.g. after running: export GEMINI_API_KEY='your_key_here'
API_KEY = os.environ.get("GEMINI_API_KEY")

# Fail fast when the key is missing or empty
if not API_KEY:
    raise ValueError("La variable de entorno GEMINI_API_KEY no está configurada. Por favor, configura tu clave de API.")

genai.configure(api_key=API_KEY)

# Model used by the cells below. Available model names can be listed with:
#   for m in genai.list_models(): print(m.name)
model = genai.GenerativeModel('gemini-2.0-flash')



In [4]:
# Generate text: classify the intent of a sample user query
prompt = "que es la ley de mendel?"
# Wrap the raw query in the classification prompt built by get_intent_prompt
intent_prompt = get_intent_prompt(prompt)

# `model` is expected to be the Gemini GenerativeModel configured in the setup cell
response = model.generate_content(intent_prompt)

# Print the generated text (the prompt asks for a bare category label)
print(response.text)

# More information about the response is available if needed
# print(response)

GENERAL_QUERY



In [5]:
# Interactive Hugging Face Hub login; the cell output is a login widget.
# Presumably needed to download the checkpoints loaded below — TODO confirm.
from huggingface_hub import notebook_login
notebook_login() 

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# Load model directly (no quantization or device placement arguments are passed)
# NOTE(review): this rebinds `model`, which the Gemini setup cell assigned to a
# GenerativeModel, so re-running the Gemini cell afterwards would use this object.
# NOTE(review): a later cell loads the same checkpoint again with 8-bit
# quantization, which replaces both names defined here.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-3B-Instruct")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import torch
import os

# 1. Checkpoint to load
model_name = "Qwen/Qwen2.5-3B-Instruct"

# 2. Quantization settings: 8-bit loading to reduce memory usage.
# For 4-bit loading use load_in_4bit=True instead, optionally together with:
#   bnb_4bit_quant_type="nf4",
#   bnb_4bit_compute_dtype=torch.float16,
#   bnb_4bit_use_double_quant=True,
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# 3. Load the model first (quantized, with a plain fallback), then the tokenizer
try:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",              # let the library choose the dtype
        device_map="auto",               # let the library place the modules on devices
        quantization_config=bnb_config,  # request the 8-bit configuration above
    )
except Exception as load_error:
    # Quantized loading failed (e.g. bitsandbytes missing or incompatible):
    # report why and retry without quantization.
    print(f"Could not load with quantization: {load_error}")
    print("Attempting to load the model normally. This may require more VRAM.")
    if torch.cuda.is_available():
        fallback_dtype = torch.float16
    else:
        fallback_dtype = torch.float32
    # device_map="auto" is kept, so no explicit model.to("cuda") is needed
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=fallback_dtype,
    )
else:
    print("Model loaded with quantization configuration.")

tokenizer = AutoTokenizer.from_pretrained(model_name)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded with quantization configuration.


In [8]:
# Show whether PyTorch reports a usable CUDA device (relies on `torch` imported in another cell).
# NOTE(review): this cell's execution count (8) is lower than the loading cell above it (9),
# so the notebook was not run top to bottom.
torch.cuda.is_available()

True

In [10]:

# 4. Wrap the already-loaded model and tokenizer in a text-generation pipeline.
# No `device` argument is passed on purpose: the original author removed
# `device=0 if torch.cuda.is_available() else -1` because it conflicts with
# the device_map="auto" placement used when loading the model.
if torch.cuda.is_available():
    pipeline_dtype = torch.float16
else:
    pipeline_dtype = torch.float32

generator = pipeline(
    "text-generation",
    tokenizer=tokenizer,
    model=model,
    torch_dtype=pipeline_dtype,
)
print("Text generation pipeline created successfully.")


Device set to use cuda:0


Text generation pipeline created successfully.


In [11]:

# 5. Conversation to send, as a list of role/content messages
messages = [
    {"role": "system", "content": "You are a helpful and friendly assistant."},
    {"role": "user", "content": "Can you tell me a short story about a space cat?"},
]

# Render the messages into one prompt string using the tokenizer's chat template.
# add_generation_prompt=True appends the marker that tells the model to start answering.
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

# 6. Generate text
print("\nGenerating response...")
generation_kwargs = {
    "max_new_tokens": 250,                      # upper bound on newly generated tokens
    "do_sample": True,                          # sample instead of greedy decoding
    "temperature": 0.7,                         # randomness (higher = more random)
    "top_k": 50,                                # sample only among the 50 most probable tokens
    "top_p": 0.95,                              # nucleus (cumulative probability) filter
    "repetition_penalty": 1.1,                  # discourage repeated phrases
    "pad_token_id": tokenizer.eos_token_id,     # avoids the padding warning
    "eos_token_id": tokenizer.eos_token_id,     # stop when the end-of-sequence token appears
}
generated_text = generator(text, **generation_kwargs)

# 7. Show the full pipeline output (prompt followed by the completion)
full_text = generated_text[0]['generated_text']
print("\n--- Generated Text (Full) ---")
print(full_text)

# Remove the prompt text so that only the assistant's reply is shown
response_only = full_text.replace(text, "").strip()
print("\n--- Assistant's Response ---")
print(response_only)


Generating response...





--- Generated Text (Full) ---
<|im_start|>system
You are a helpful and friendly assistant.<|im_end|>
<|im_start|>user
Can you tell me a short story about a space cat?<|im_end|>
<|im_start|>assistant
Sure! Here's a little tale for you:

In the vast expanse of space, there was a tiny feline who had quite an adventurous life aboard the spaceship "Galactic Explorer." This cat, we'll call her Purr-Ex, loved nothing more than exploring new worlds.

One day, while floating around in outer space, she discovered a beautiful planet orbiting a distant star. It wasn't Earth or Mars, but it seemed like a place where life could thrive. The cat decided to investigate further!

Purr-Ex zipped through the ship’s airlock and floated out into the vacuum of space towards this newfound planet. As she approached, the atmosphere looked invitingly blue and green, with towering mountains and lush valleys that reminded her of home.

When Purr-Ex landed on what appeared to be a rocky beach, she couldn’t resist 

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import torch
import os

import gc

# 1. Specify the model name
model_name = "leonidasmv/mistral-7b-instruct-v0.3-auditory-assistant-finetuning"

# Release whatever earlier cells left behind before loading a new checkpoint.
# The previous `model` and the `generator` pipeline that wraps it keep their
# weights alive for as long as the names exist; with them still resident the
# 8-bit load below was rejected for lack of GPU RAM and the fallback offloaded
# weights to disk. `pop(..., None)` keeps this cell runnable on a fresh kernel
# where the names do not exist yet.
# NOTE(review): whether the quantized model fits after this depends on the GPU.
for _stale_name in ("generator", "model"):
    globals().pop(_stale_name, None)
del _stale_name
gc.collect()
if torch.cuda.is_available():
    # Return cached, now-unused blocks so device_map="auto" sees the freed memory
    torch.cuda.empty_cache()

# 2. Configure quantization for efficient memory usage (recommended)
# This is the modern way to use 8-bit or 4-bit loading.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True, # Set to True for 8-bit loading, or load_in_4bit=True for 4-bit
    # If you choose 4-bit, you might also want:
    # bnb_4bit_quant_type="nf4",
    # bnb_4bit_compute_dtype=torch.float16,
    # bnb_4bit_use_double_quant=True,
)

# 3. Load the tokenizer and the model
try:
    # Attempt to load the model with the BitsAndBytesConfig
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config, # Pass the BitsAndBytesConfig object here
        device_map="auto",             # Automatically maps model parts to available devices
        torch_dtype="auto"             # Automatically chooses appropriate dtype (e.g., float16 for GPU)
    )
    print("Model loaded with quantization configuration.")
except Exception as e:
    # Fallback to normal loading if quantization fails (e.g., bitsandbytes not
    # installed/compatible, or not enough GPU memory for the quantized model)
    print(f"Could not load with quantization: {e}")
    print("Attempting to load the model normally. This may require more VRAM.")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" # Still use device_map="auto" for proper device assignment
    )
    # No need for model.to("cuda") if device_map="auto" is used

tokenizer = AutoTokenizer.from_pretrained(model_name)


config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Could not load with quantization: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
Attempting to load the model normally. This may require more VRAM.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Some parameters are on the meta device because they were offloaded to the disk.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

In [13]:

# 4. Wrap the already-loaded model and tokenizer in a text-generation pipeline.
# No `device` argument is passed on purpose: the original author removed
# `device=0 if torch.cuda.is_available() else -1` because it conflicts with
# the device_map="auto" placement used when loading the model.
if torch.cuda.is_available():
    pipeline_dtype = torch.float16
else:
    pipeline_dtype = torch.float32

generator = pipeline(
    "text-generation",
    tokenizer=tokenizer,
    model=model,
    torch_dtype=pipeline_dtype,
)
print("Text generation pipeline created successfully.")


Device set to use cuda:0


Text generation pipeline created successfully.


In [16]:

# 5. Conversation to send, as a list of role/content messages (a single user turn)
messages = [
    {"role": "user", "content": "Que puedo estudiar?? y quien es Leonidas Moreno Vàsquez?"},
]

# Render the messages into one prompt string using the chat template of the
# currently loaded tokenizer. add_generation_prompt=True asks the template to
# end the prompt where the model's answer should begin.
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

# 6. Generate text
print("\nGenerating response...")
generation_kwargs = {
    "max_new_tokens": 250,                      # upper bound on newly generated tokens
    "do_sample": True,                          # sample instead of greedy decoding
    "temperature": 0.7,                         # randomness (higher = more random)
    "top_k": 50,                                # sample only among the 50 most probable tokens
    "top_p": 0.95,                              # nucleus (cumulative probability) filter
    "repetition_penalty": 1.1,                  # discourage repeated phrases
    "pad_token_id": tokenizer.eos_token_id,     # avoids the padding warning
    "eos_token_id": tokenizer.eos_token_id,     # stop when the end-of-sequence token appears
}
generated_text = generator(text, **generation_kwargs)

# 7. Show the full pipeline output (prompt followed by the completion)
full_text = generated_text[0]['generated_text']
print("\n--- Generated Text (Full) ---")
print(full_text)

# Remove the prompt text so that only the assistant's reply is shown
response_only = full_text.replace(text, "").strip()
print("\n--- Assistant's Response ---")
print(response_only)


Generating response...

--- Generated Text (Full) ---
<s>[INST] Que puedo estudiar?? y quien es Leonidas Moreno Vàsquez?[/INST] Leonidas Moreno Vásquez es mi creador, un joven de 24 años con discapacidad auditiva del 90% en ambos oídos. Estudió ingeniería informática y se especializó en IA. Puedes estudiar cualquier carrera que te guste. La tecnología ofrece muchas oportunidades.

--- Assistant's Response ---
Leonidas Moreno Vásquez es mi creador, un joven de 24 años con discapacidad auditiva del 90% en ambos oídos. Estudió ingeniería informática y se especializó en IA. Puedes estudiar cualquier carrera que te guste. La tecnología ofrece muchas oportunidades.
