# 🧑‍💻 Introducción a MLflow (Parte II): Tracking de Modelos de Lenguaje (LLMs).
Integrantes: Tobías Romero **(2021214011)** y Jenifer Roa **(2022214006)**
---

## 1. Importación de librerías.

In [None]:
import warnings
warnings.filterwarnings('ignore')

import time
import json
import os
from datetime import datetime

import mlflow
import mlflow.pyfunc
from mlflow.models import infer_signature
from mlflow.tracking import MlflowClient

import google.generativeai as genai
from openai import OpenAI

from dotenv import load_dotenv

## 2. Configuración de API Keys y DagsHub/MLflow.

In [None]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# --- API keys ---
# os.getenv returns None when a variable is unset; no validation happens here,
# so a missing key is passed on to the SDK clients as-is.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)
# OpenRouter is reached through the OpenAI client by overriding base_url.
openrouter_client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=OPENROUTER_API_KEY)

# --- DagsHub + remote MLflow ---
# NOTE(review): presumably this points MLflow's tracking URI at the DagsHub
# repository below -- confirm against the dagshub documentation.
import dagshub
dagshub.init(repo_owner='jenifer8092', repo_name='Laboratorio-MLFLOW', mlflow=True)

# (Optional) Experiment name can be overridden through the EXPERIMENT_NAME env var.
experiment_name = os.getenv("EXPERIMENT_NAME", "LLM_Comparison_Gemini_vs_Deepseek")

# Free-text description; stored further down under the "mlflow.note.content"
# experiment tag.
descripcion = """
Comparaci√≥n de modelos LLM (Gemini vs DeepSeek).
Incluye: prompts controlados, datasets de prueba y m√©tricas autom√°ticas
(latencia, tokens y coste estimado por token).
"""
# Experiment-level tags: passed at creation, or re-applied one by one when the
# experiment already exists.
tags_exp = {
    "owner": "Tob√≠as Romero",
    "proyecto": "MLOps",
    "model_family": "LLM",
    "providers": "Gemini, DeepSeek(OpenRouter)",
    "tracking": "notebook",
}

# Look the experiment up by name; if it is in the "deleted" lifecycle stage,
# restore it and fetch it again before deciding whether to create it.
client = MlflowClient()
exp = mlflow.get_experiment_by_name(experiment_name)
if exp and getattr(exp, "lifecycle_stage", None) == "deleted":
    client.restore_experiment(exp.experiment_id)
    exp = mlflow.get_experiment_by_name(experiment_name)
# Create the experiment with its tags, or refresh the tags on the existing one.
if exp is None:
    exp_id = client.create_experiment(experiment_name, tags=tags_exp)
else:
    exp_id = exp.experiment_id
    for k, v in tags_exp.items():
        client.set_experiment_tag(exp_id, k, v)
# The description tag is (re)written on every execution, in both branches.
client.set_experiment_tag(exp_id, "mlflow.note.content", descripcion)
mlflow.set_experiment(experiment_name)

# Echo the resulting configuration; the experiment is re-fetched so the printed
# tags include the ones just written.
print(f"‚úì Tracking URI: {mlflow.get_tracking_uri()}")
exp_actualizado = mlflow.get_experiment(exp_id)
print(f"‚úì Experimento: {exp_actualizado.name} | ID: {exp_actualizado.experiment_id}")
print(f"‚úì Artifacts: {exp_actualizado.artifact_location}")
print(f"‚úì Tags: {exp_actualizado.tags}")

## 3. Definición de tareas y prompts.

In [None]:
# Benchmark tasks, keyed by task identifier. Every entry carries the exact
# "prompt" sent to each model and a short human-readable "description".
TASKS = {
    "creative_writing": dict(
        prompt="Escribe un cuento corto de ciencia ficci√≥n sobre un robot que aprende a sentir emociones. M√°ximo 200 palabras.",
        description="Tarea de escritura creativa y narrativa",
    ),
    "code_generation": dict(
        prompt="Genera una funci√≥n en Python que implemente el algoritmo de b√∫squeda binaria con comentarios explicativos.",
        description="Generaci√≥n de c√≥digo con documentaci√≥n",
    ),
    "question_answering": dict(
        prompt="Explica qu√© es el aprendizaje por refuerzo en machine learning y proporciona un ejemplo pr√°ctico de su aplicaci√≥n.",
        description="Respuesta a preguntas t√©cnicas",
    ),
    "summarization": dict(
        prompt="Resume los principios fundamentales de la programaci√≥n orientada a objetos en 5 puntos clave.",
        description="Resumen y s√≠ntesis de informaci√≥n",
    ),
    "translation": dict(
        prompt="Traduce el siguiente texto al ingl√©s de manera natural: 'El machine learning ha revolucionado la forma en que procesamos y analizamos grandes vol√∫menes de datos en tiempo real.'",
        description="Traducci√≥n de texto t√©cnico",
    ),
}

# List the configured tasks, one bullet per task, in definition order.
print("Tareas definidas:")
for task_name in TASKS:
    print(f"  ‚Ä¢ {task_name}: {TASKS[task_name]['description']}")

## 4. Helpers: costos, logging de artifacts.

In [None]:
def _env_float(var_name, default=0.0):
    """Read a float from the environment, treating unset or blank as ``default``.

    A ``.env`` line such as ``GEMINI_IN_COST_PER_1K=`` yields an empty string,
    which ``float()`` rejects; that case falls back to ``default`` instead.

    Args:
        var_name: Name of the environment variable to read.
        default: Value returned when the variable is unset or only whitespace.

    Returns:
        The parsed float, or ``default``.

    Raises:
        ValueError: If the variable is set to a non-numeric value; the message
            names the offending variable.
    """
    raw = os.getenv(var_name)
    if raw is None or not raw.strip():
        return default
    try:
        return float(raw)
    except ValueError as err:
        raise ValueError(
            f"Environment variable {var_name} must be a number, got {raw!r}"
        ) from err


# Cost in USD per 1K tokens for each provider, split into input ("in") and
# output ("out") rates. Rates come from the environment and default to 0.0.
COSTS = {
    "gemini": {
        "in": _env_float("GEMINI_IN_COST_PER_1K"),
        "out": _env_float("GEMINI_OUT_COST_PER_1K"),
    },
    "deepseek": {
        "in": _env_float("DEEPSEEK_IN_COST_PER_1K"),
        "out": _env_float("DEEPSEEK_OUT_COST_PER_1K"),
    },
}


def estimate_cost_usd(provider_key, input_tokens, output_tokens):
    """Estimate the cost in USD of one call from its token counts.

    Args:
        provider_key: Key into ``COSTS`` (e.g. ``"gemini"``, ``"deepseek"``).
            Unknown providers are priced at zero.
        input_tokens: Number of prompt tokens.
        output_tokens: Number of completion tokens.

    Returns:
        ``input_tokens / 1000 * in_rate + output_tokens / 1000 * out_rate``.
    """
    rates = COSTS.get(provider_key, {"in": 0.0, "out": 0.0})
    return (input_tokens / 1000.0) * rates["in"] + (output_tokens / 1000.0) * rates["out"]

def save_artifacts(task_name, prompt, response_data, model_name, temperature, artifact_root="llm_runs"):
    """Log the prompt, the response and a JSON summary as artifacts of the active run.

    Three files are written under ``{artifact_root}/{task_name}/``:
    ``prompt.txt``, ``response.txt`` and ``experiment.json``.

    Args:
        task_name: Task identifier; used as the artifact sub-directory.
        prompt: Prompt text that was sent to the model.
        response_data: Mapping with the keys ``"response"``, ``"latency"``,
            ``"input_tokens"`` and ``"output_tokens"``.
        model_name: Model identifier recorded in the artifacts.
        temperature: Sampling temperature recorded in the artifacts.
        artifact_root: Top-level artifact directory.

    Raises:
        AssertionError: If no MLflow run is active.
        KeyError: If ``response_data`` lacks one of the required keys.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    # The exception type and message are unchanged.
    if mlflow.active_run() is None:
        raise AssertionError("Debe haber un mlflow.start_run() activo.")

    response_text = response_data['response']
    latency = response_data['latency']
    input_tokens = response_data['input_tokens']
    output_tokens = response_data['output_tokens']

    base = f"{artifact_root}/{task_name}"
    rule = "=" * 80

    mlflow.log_text(
        f"TAREA: {task_name}\n{rule}\n\n{prompt}\n",
        artifact_file=f"{base}/prompt.txt",
    )
    mlflow.log_text(
        f"MODELO: {model_name}\nTEMPERATURA: {temperature}\nLATENCIA: {latency:.3f}s\n"
        f"INPUT_TOKENS: {input_tokens}\nOUTPUT_TOKENS: {output_tokens}\n"
        f"{rule}\n\n{response_text}\n",
        artifact_file=f"{base}/response.txt",
    )
    mlflow.log_dict(
        {
            "model": model_name,
            "task": task_name,
            "temperature": temperature,
            "prompt": prompt,
            "response": response_text,
            "latency_seconds": latency,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "total_tokens": input_tokens + output_tokens,
            # Local time without timezone information, as before.
            "timestamp": datetime.now().isoformat(),
        },
        artifact_file=f"{base}/experiment.json",
    )


## 5. Llamados a modelos LLMs (parametrizados).

In [None]:
def call_gemini(prompt, temperature=0.7, model_name="gemini-2.0-flash-exp"):
    """Send ``prompt`` to a Gemini model and return the text plus usage figures.

    Args:
        prompt: Prompt text.
        temperature: Sampling temperature passed in the generation config.
        model_name: Gemini model identifier.

    Returns:
        Dict with ``"response"`` (str), ``"latency"`` (seconds, float),
        ``"input_tokens"`` (int) and ``"output_tokens"`` (int).
    """
    model = genai.GenerativeModel(model_name)
    # perf_counter is monotonic, so the measurement is immune to wall-clock
    # adjustments that can skew time.time() deltas.
    start_time = time.perf_counter()
    response = model.generate_content(
        prompt,
        generation_config=genai.types.GenerationConfig(temperature=temperature)
    )
    latency = time.perf_counter() - start_time
    # NOTE(review): getattr's default only covers a missing attribute; if the
    # SDK's `text` accessor raises for blocked/empty responses, that exception
    # still propagates to the caller -- confirm against the SDK docs.
    # `or ''` keeps a None text from crashing the word-count fallback below.
    text = getattr(response, 'text', '') or ''
    usage = getattr(response, 'usage_metadata', None)
    if usage:
        input_tokens = int(usage.prompt_token_count or 0)
        output_tokens = int(usage.candidates_token_count or 0)
    else:
        # Rough estimate when the SDK exposes no usage data:
        # ~1.3 tokens per whitespace-separated word.
        input_tokens = int(len(prompt.split()) * 1.3)
        output_tokens = int(len(text.split()) * 1.3)
    return {"response": text, "latency": latency, "input_tokens": input_tokens, "output_tokens": output_tokens}

def call_deepseek(prompt, temperature=0.7, model_name="deepseek/deepseek-chat"):
    """Send ``prompt`` to a DeepSeek model via OpenRouter and return text plus usage.

    Args:
        prompt: Prompt text, sent as a single user message.
        temperature: Sampling temperature.
        model_name: Model identifier passed to the chat completions endpoint.

    Returns:
        Dict with ``"response"`` (str), ``"latency"`` (seconds, float),
        ``"input_tokens"`` (int) and ``"output_tokens"`` (int).
    """
    # perf_counter is monotonic, so the measurement is immune to wall-clock
    # adjustments that can skew time.time() deltas.
    start_time = time.perf_counter()
    completion = openrouter_client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature
    )
    latency = time.perf_counter() - start_time
    # The message content may be None; normalise to "" so the .split() below
    # and in callers cannot fail on it.
    response_text = completion.choices[0].message.content or ""
    usage = getattr(completion, 'usage', None)
    # `or 0` covers both a missing usage object and usage fields set to None,
    # either of which would make int() raise TypeError.
    input_tokens = int(getattr(usage, 'prompt_tokens', 0) or 0)
    output_tokens = int(getattr(usage, 'completion_tokens', 0) or 0)
    # No usage reported: rough estimate of ~1.3 tokens per whitespace-separated word.
    if (input_tokens + output_tokens) == 0:
        input_tokens = int(len(prompt.split()) * 1.3)
        output_tokens = int(len(response_text.split()) * 1.3)
    return {"response": response_text, "latency": latency, "input_tokens": input_tokens, "output_tokens": output_tokens}


## 6. PythonModel stub (para registrar en el Model Registry).

In [None]:
class LLMStubModel(mlflow.pyfunc.PythonModel):
    """Placeholder pyfunc model that records which external LLM a registry entry refers to.

    It performs no inference: ``predict`` returns a fixed message naming the
    provider and model that should be called externally.
    """

    def __init__(self, provider: str, model_name: str):
        """Store the provider label and model identifier echoed by ``predict``."""
        self.provider = provider
        self.model_name = model_name

    def predict(self, context, model_input):
        """Return one placeholder message per input row.

        Args:
            context: Unused.
            model_input: Any input; its ``len()`` sets the number of messages.
                Inputs without a length count as a single row.

        Returns:
            A list of identical placeholder strings.
        """
        # Sized inputs, DataFrames included, all go through len(), so the
        # former `to_dict` special case was redundant.
        try:
            n_rows = len(model_input)
        except TypeError:
            n_rows = 1
        message = f"Use provider={self.provider} model={self.model_name} para inferencia de LLM en producci√≥n."
        return [message] * n_rows


## 7. Runner de experimentos (con logging y registro en Registry).

In [None]:
def run_experiment(model_type, model_name, task_name, task_info, temperature=0.7):
    """Run one task against one model and record everything in a new MLflow run.

    The run logs parameters, metrics, the prompt/response artifacts and tags,
    and registers a stub model under ``llm_{model_type}_chat``. A failing model
    call does not abort the run: it is logged with ``success=0``, zeroed
    latency and token counts, and the error text as the response.

    Args:
        model_type: ``"gemini"`` selects the Gemini call; any other value is
            routed to the DeepSeek/OpenRouter call.
        model_name: Model identifier passed to the provider call.
        task_name: Task identifier; used in the run name, params and tags.
        task_info: Mapping with the keys ``"prompt"`` and ``"description"``.
        temperature: Sampling temperature.
    """
    run_name = f"{model_type}_{task_name}"
    prompt = task_info["prompt"]
    is_gemini = model_type == "gemini"
    provider = "Google AI" if is_gemini else "OpenRouter"
    provider_key = "gemini" if is_gemini else "deepseek"

    print(f"\n{'='*80}\nEXPERIMENTO: {run_name}\nTarea: {task_info['description']}\n{'='*80}\n\n PROMPT:\n{prompt}\n")

    with mlflow.start_run(run_name=run_name):
        # Deliberately broad: any provider failure is recorded as a failed run
        # rather than interrupting the remaining experiments.
        try:
            if is_gemini:
                result = call_gemini(prompt, temperature, model_name)
            else:
                result = call_deepseek(prompt, temperature, model_name)
            success = True
        except Exception as e:
            print(f"Error: {e}\n")
            result = {"response": f"ERROR: {e}", "latency": 0, "input_tokens": 0, "output_tokens": 0}
            success = False

        # A provider may hand back no text; normalise so the len()/split()
        # calls below cannot fail on None.
        response_text = result['response'] or ""
        result['response'] = response_text
        latency = result['latency']
        input_tokens = result['input_tokens']
        output_tokens = result['output_tokens']

        print("="*80 + "\nRESPUESTA:\n" + "="*80)
        print(response_text)
        print("="*80 + "\n")

        # Params -- one batched call instead of one request per parameter.
        mlflow.log_params({
            "model_name": model_name,
            "model_type": model_type,
            "temperature": temperature,
            "task_type": task_name,
            "task_description": task_info["description"],
            "provider": provider,
        })

        # Metrics
        total_tokens = input_tokens + output_tokens
        words_per_second = (len(response_text.split()) / latency) if latency > 0 else 0
        est_cost = estimate_cost_usd(provider_key, input_tokens, output_tokens)

        mlflow.log_metrics({
            "latency_seconds": latency,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "total_tokens": total_tokens,
            "response_length_chars": len(response_text),
            "words_per_second": words_per_second,
            "estimated_cost_usd": est_cost,
            "success": 1 if success else 0,
        })

        # Artifacts
        save_artifacts(task_name, prompt, result, model_name, temperature)

        # Tags. Failed runs carry latency 0 and therefore land in the "fast"
        # tier; filter on the "status" tag when comparing latencies.
        if latency < 2:
            latency_tier = "fast"
        elif latency < 5:
            latency_tier = "medium"
        else:
            latency_tier = "slow"

        mlflow.set_tags({
            "model_family": "LLM",
            "provider": provider,
            "model_type": model_type,
            "task_category": task_name,
            "language": "espa√±ol",
            "status": "success" if success else "failed",
            "latency_tier": latency_tier,
            # Run note
            "mlflow.note.content": f"LLM {model_name} ({provider}) | Tarea: {task_info['description']} | Temp={temperature} | Lat={latency:.3f}s | Tokens={total_tokens} | Cost‚âà${est_cost:.4f}",
        })

        # Register a stub in the Model Registry under a per-model-type name.
        registry_name = f"llm_{model_type}_chat"
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=LLMStubModel(provider=provider, model_name=model_name),
            registered_model_name=registry_name,
            pip_requirements=["pandas"]
        )

        print("Experimento registrado en MLflow")
        print(f"Registry: {registry_name}\n")


## 8. Ejecutar suite de tareas para cada modelo.

In [None]:
# Model identifiers and temperature; each can be overridden through the environment.
GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-2.0-flash-exp")
DEEPSEEK_MODEL = os.getenv("DEEPSEEK_MODEL", "deepseek/deepseek-chat")
TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.7"))

# Run the whole task suite for Gemini first, then for DeepSeek, pausing one
# second after every run.
for model_type, model_name in (("gemini", GEMINI_MODEL), ("deepseek", DEEPSEEK_MODEL)):
    for task_name, task_info in TASKS.items():
        run_experiment(
            model_type=model_type,
            model_name=model_name,
            task_name=task_name,
            task_info=task_info,
            temperature=TEMPERATURE,
        )
        time.sleep(1)
