## Generación de mensajes a través de la API de NVIDIA

In [None]:
import os

from openai import OpenAI

# Build the client for NVIDIA's OpenAI-compatible endpoint. The key is read
# from the environment so that no credential is stored in the notebook.
api_key = os.environ.get("NVIDIA_API_KEY")
if not api_key:
  raise RuntimeError(
    "Define la variable de entorno NVIDIA_API_KEY antes de ejecutar este notebook."
  )

client = OpenAI(
  base_url = "https://integrate.api.nvidia.com/v1",
  api_key = api_key
)

# Smoke-test prompt. It is sent to the model verbatim, so it stays in Spanish.
# (The original opened with four quotes, which put a stray '"' at the start.)
instruccion = """
Realiza un pitch deck para la marca Nvidia con únicamente un solo texto plano resumido, sin introducción ni nada, solo el resumen 
"""

completion = client.chat.completions.create(
  model="marin/marin-8b-instruct",
  messages=[{"role":"user","content":instruccion}],
  temperature=0.7,
  top_p=0.9,
  max_tokens=512,
  stream=True
)

# Print the streamed answer piece by piece as it arrives.
for chunk in completion:
  # A streamed chunk may carry no choices; skip it instead of indexing an
  # empty list.
  if not chunk.choices:
    continue
  fragment = chunk.choices[0].delta.content
  if fragment is not None:
    print(fragment, end="")

print()

NVIDIA: Poder en tu mano

NVIDIA revoluciona la tecnología con soluciones avanzadas en inteligencia artificial, aprendizaje profundo, gaming, y gráficos, llevándola al futuro.


Leer los archivos limpios: 

In [3]:
import pandas as pd 
# Load the cleaned project sample, report its row count and preview it.
df = pd.read_json("./data_cleaned.json")
print("Tamaño de muestra: ", len(df))
df.head()

Tamaño de muestra:  230


Unnamed: 0,category_name,blurb,name,backers_count,staff_pick
8,3D Printing,Organize anything with our innovative 3D print...,Thread Boards 2.0 | 3D printable peg boards wi...,2389,True
53,Animation,Alyson confronts her perfectionism as she stru...,Alyssum - A 2D Animated Short Film,75,True
55,Animation,A boy wonders if there might be other people l...,Is There Anyone Out There?,140,True
56,Animation,This will be the world's first feature-length ...,Loving Vincent Film - bring Van Gogh paintings...,796,True
57,Animation,A student animated musical short in which a ha...,Hamstercide: An Animated Musical Short,26,True


In [4]:
# Distinct category names, in the order produced by value_counts().
categorias_unicas = list(df['category_name'].value_counts().index)
print(categorias_unicas)
len(categorias_unicas)

['Anthologies', 'Product Design', 'Art Books', 'Robots', 'Animation', 'Cookbooks', 'Calendars', 'Literary Spaces', 'Music', 'Periodicals', 'Pottery', 'Indie Rock', 'Illustration', 'Comics', 'Drinks', 'Performances', 'Journalism', 'Publishing', 'Public Art', 'Social Practice', 'Stationery', 'Radio & Podcasts', 'Nonfiction', 'Comic Books', 'Audio', 'Fiction', 'Embroidery', 'Classical Music', 'Theater', 'Typography', 'Video Games', 'Electronic Music', 'Games', 'Glass', 'Graphic Novels', 'Gadgets', 'Dance', 'Design', 'Documentary', 'Flight', 'Residencies', 'Sound', 'Shorts', 'Puzzles', "Children's Books", 'Comedy', 'Chiptune', 'Zines', 'Young Adult', 'Webcomics', 'Weaving', 'World Music', 'Print', 'Letterpress', 'Taxidermy', 'Photography', 'Movie Theaters', 'Pop', 'Photobooks', 'Mobile Games', 'Metal', 'Installations', 'Food', 'DIY', 'Ceramics', '3D Printing', 'Architecture', 'Apps', 'Civic Design', 'Childrenswear', 'Footwear', "Farmer's Markets", 'Fashion', 'Country & Folk', 'Conceptual A

111

In [5]:
# Inspect the column labels of the loaded sample.
df.columns

Index(['category_name', 'blurb', 'name', 'backers_count', 'staff_pick'], dtype='object')

## Creación de la base de datos con prompt genérico: 

In [7]:
from openai import OpenAI
import pandas as pd

def _first_delta(chunk):
    """Return the delta of the chunk's first choice, or None when the chunk has no choices."""
    choices = getattr(chunk, "choices", None)
    return choices[0].delta if choices else None


def _extract_content(chunk):
    """Return the answer text carried by a streamed chunk, or None when it has none."""
    delta = _first_delta(chunk)
    return None if delta is None else delta.content


def _extract_reasoning_and_content(chunk):
    """Return the chunk's reasoning text followed by its answer text ("" when both are absent)."""
    delta = _first_delta(chunk)
    if delta is None:
        return ""
    return (getattr(delta, "reasoning_content", "") or "") + (delta.content or "")


# Per-model request parameters plus the callable that pulls text out of each
# streamed chunk. Keys are the model identifiers passed to the API.
# NOTE(review): the qwen extractor deliberately keeps the reasoning text, and
# the stored deepseek-r1 outputs start with "<think>", so `created_text`
# contains reasoning as well as the answer -- confirm that is what the
# similarity metrics computed later should be fed.
models_config = {
    "marin/marin-8b-instruct": {
        "params": {
            "temperature": 0.7,
            "top_p": 0.9,
            "max_tokens": 512
        },
        "extract_func": _extract_content
    },
    "deepseek-ai/deepseek-r1": {
        "params": {
            "temperature": 0.6,
            "top_p": 0.7,
            "max_tokens": 4096
        },
        "extract_func": _extract_content
    },
    "qwen/qwen3-235b-a22b": {
        "params": {
            "temperature": 0.2,
            "top_p": 0.7,
            "max_tokens": 8192,
            "extra_body": {"chat_template_kwargs": {"thinking": True}}
        },
        "extract_func": _extract_reasoning_and_content
    }
}

num_of_samples = len(df)
rows = []

# Generate one pitch per (model, sample) pair using the generic prompt.
for model_name, model_config in models_config.items():
    print(f"\nUsando modelo: {model_name}")
    extract_fragment = model_config["extract_func"]

    for i in range(num_of_samples):
        try:
            # Fetch the row once instead of once per field.
            sample = df.iloc[i]
            category = sample['category_name']
            name = sample['name']
            original_text = sample['blurb']

            instruction = f"""
Summarize the essence of the brand {category} and the project {name} in a single, compact paragraph, conveying its core idea clearly and directly.
            """

            completion = client.chat.completions.create(
                model=model_name,
                messages=[{"role": "user", "content": instruction}],
                stream=True,
                **model_config["params"]
            )

            # Concatenate every non-empty fragment of the stream in one pass.
            full_text = "".join(filter(None, map(extract_fragment, completion)))

            rows.append({
                'category_name': category,
                'original_text': original_text,
                'created_text': full_text,
                'name': name,
                'model_name': model_name,
                'prompt_type': 'generic'
            })

            if (i % 20) == 0:
                print(f"Pitch {i+1}/{num_of_samples} generado.")

        except Exception as e:
            # Best effort: report the failure and move on; the failed sample
            # contributes no row for this model.
            print(f"Error con modelo {model_name}, muestra {i+1}: {e}")

# Build the final DataFrame from the collected rows.
df_generated = pd.DataFrame(rows)


Usando modelo: marin/marin-8b-instruct
Pitch 1/230 generado.
Pitch 21/230 generado.
Pitch 41/230 generado.
Pitch 61/230 generado.
Pitch 81/230 generado.
Pitch 101/230 generado.
Pitch 121/230 generado.
Pitch 141/230 generado.
Pitch 161/230 generado.
Pitch 181/230 generado.
Pitch 201/230 generado.
Pitch 221/230 generado.

Usando modelo: deepseek-ai/deepseek-r1
Pitch 1/230 generado.
Pitch 21/230 generado.
Pitch 41/230 generado.
Pitch 61/230 generado.
Pitch 81/230 generado.
Pitch 101/230 generado.
Pitch 121/230 generado.
Pitch 141/230 generado.
Pitch 161/230 generado.
Pitch 181/230 generado.
Pitch 201/230 generado.
Pitch 221/230 generado.

Usando modelo: qwen/qwen3-235b-a22b
Pitch 1/230 generado.
Pitch 21/230 generado.
Pitch 41/230 generado.
Pitch 61/230 generado.
Pitch 81/230 generado.
Pitch 101/230 generado.
Pitch 121/230 generado.
Pitch 141/230 generado.
Pitch 161/230 generado.
Pitch 181/230 generado.
Pitch 201/230 generado.
Pitch 221/230 generado.


In [10]:
# Print the size first and leave head() as the cell's last expression:
# Jupyter only renders the final expression, so the original order silently
# discarded the preview.
print("Longitud del dataset generado: ", df_generated.shape[0])
df_generated.head()

Longitud del dataset generado:  690


In [13]:
# Save the generated DataFrame as CSV.
df_generated.to_csv("generic_pitches.csv", index=False)

# Example of a generated text. It is the last expression so that Jupyter
# displays it (as a non-final expression its value was discarded).
df_generated.iloc[0]['created_text']

## Creación de la base de datos con prompt estructurado: 

In [15]:
structured_rows = []

# Generate one pitch per (model, sample) pair using the structured prompt,
# which quotes the original blurb and asks for Problem / Solution / Value
# Proposition sections.
for model_name, model_config in models_config.items():
    print(f"\nUsando modelo: {model_name}")
    extract_fragment = model_config["extract_func"]

    for i in range(num_of_samples):
        try:
            # Fetch the row once; only the fields the prompt uses are read.
            sample = df.iloc[i]
            category_name = sample['category_name']
            name = sample['name']
            blurb = sample['blurb']
            # To change the instruction, keep the whole text inside the quotes.
            # Variables are given in braces and can be moved around.
            instruction = f"""
            Generate a professional and structured pitch deck summary for a project titled **'{name}'**, which falls under the **'{category_name}'** category. This pitch should be a fluent and engaging extension of the original project description (blurb) provided below.

            Original Project Blurb:
            "{blurb}"

            **Instructions for Generation (Max: 150 words):**

            1. **Structure** (use headings and bullet points if needed):
            - **Problem (1–2 sentences):** Clearly describe the central problem addressed by the project. Include at least 2 keywords or phrases (2–4 words) directly from the blurb.
            - **Solution (2–3 sentences):** Explain how the project '{name}' provides a unique or effective solution. Reuse at least 3 phrases from the blurb and incorporate relevant synonyms to enhance lexical richness.
            - **Value Proposition (1 sentence):** Provide a compelling summary combining key nouns and verbs from the blurb, rephrased with synonyms or grammatical variations (e.g., "connect" → "connection", "automates" → "automation").

            2. **Lexical and Semantic Alignment Guidelines (to optimize similarity metrics):**
            - Aim for **at least 70% lexical overlap** with the blurb.
            - Preserve key **verbs, nouns, and adjectives**, while enriching the language with inflections or morphological variants.
            - Incorporate **multi-word expressions** (n-grams) from the blurb without repetition or redundancy.
            - Ensure the **semantic intent and core meaning remain unchanged**, even with paraphrasing or reordering.

            3. **Style**:
            - Use **natural, persuasive, and concise** language.
            - Avoid overly generic or vague phrases.
            - Ensure coherence and logical flow across sections.
            """

            completion = client.chat.completions.create(
                model=model_name,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a professional pitch deck generator. Your task is to create compelling, structured summaries based on project blurbs, highlighting key challenges, solutions, and unique value propositions. Always aim for clarity, conciseness, and strong lexical alignment with the input."
                    },
                    {
                        "role": "user",
                        "content": instruction
                    }
                ],
                stream=True,
                **model_config["params"]
            )

            # Concatenate every non-empty fragment of the stream in one pass.
            full_text = "".join(filter(None, map(extract_fragment, completion)))

            structured_rows.append({
                'category_name': category_name,
                'original_text': blurb,
                'created_text': full_text,
                'name': name,
                'model_name': model_name,
                'prompt_type': 'structured' # change it to 'structured_n'
            })

            if (i % 20) == 0:
                print(f"Pitch {i+1}/{num_of_samples} generado.")

        except Exception as e:
            # Best effort: report the failure and move on; the failed sample
            # contributes no row for this model.
            print(f"Error con modelo {model_name}, muestra {i+1}: {e}")


df_structured = pd.DataFrame(structured_rows) # you will rename this variable


Usando modelo: marin/marin-8b-instruct
Pitch 1/230 generado.
Pitch 21/230 generado.
Pitch 41/230 generado.
Pitch 61/230 generado.
Pitch 81/230 generado.
Pitch 101/230 generado.
Pitch 121/230 generado.
Pitch 141/230 generado.
Pitch 161/230 generado.
Pitch 181/230 generado.
Pitch 201/230 generado.
Pitch 221/230 generado.

Usando modelo: deepseek-ai/deepseek-r1
Pitch 1/230 generado.
Pitch 21/230 generado.
Pitch 41/230 generado.
Pitch 61/230 generado.
Pitch 81/230 generado.
Pitch 101/230 generado.
Pitch 121/230 generado.
Pitch 141/230 generado.
Pitch 161/230 generado.
Pitch 181/230 generado.
Pitch 201/230 generado.
Pitch 221/230 generado.

Usando modelo: qwen/qwen3-235b-a22b
Pitch 1/230 generado.
Pitch 21/230 generado.
Pitch 41/230 generado.
Pitch 61/230 generado.
Pitch 81/230 generado.
Pitch 101/230 generado.
Error con modelo qwen/qwen3-235b-a22b, muestra 102: The read operation timed out
Pitch 121/230 generado.
Pitch 141/230 generado.
Pitch 161/230 generado.
Pitch 181/230 generado.
Pitc

In [16]:
# Save the structured-prompt DataFrame as CSV (without the index column).
df_structured.to_csv("structured_pitches.csv", index=False)

## Combinando en un solo df el prompt genérico y estructurado: 

In [2]:
import pandas as pd
# Reload both generated datasets from disk.
df_generated = pd.read_csv("generic_pitches.csv")
df_structured = pd.read_csv("structured_pitches.csv")

# Add here every DataFrame that should be combined (for example df_structured_1).
dataframes = [df_generated, df_structured]

# Stack the frames, group the rows by model and prompt type, and renumber.
df_combined = (
    pd.concat(dataframes, axis=0, ignore_index=True)
    .sort_values(by=['model_name', 'prompt_type'])
    .reset_index(drop=True)
)

print(f"DataFrame combinado ({len(df_combined)} filas):")
print(df_combined[['model_name', 'prompt_type', 'name']].head(6))

# Save the combined DataFrame to a CSV file.
df_combined.to_csv('pitch_decks_combined.csv', index=False)

DataFrame combinado (1379 filas):
                model_name prompt_type  \
0  deepseek-ai/deepseek-r1     generic   
1  deepseek-ai/deepseek-r1     generic   
2  deepseek-ai/deepseek-r1     generic   
3  deepseek-ai/deepseek-r1     generic   
4  deepseek-ai/deepseek-r1     generic   
5  deepseek-ai/deepseek-r1     generic   

                                                name  
0  Thread Boards 2.0 | 3D printable peg boards wi...  
1                 Alyssum - A 2D Animated Short Film  
2                         Is There Anyone Out There?  
3  Loving Vincent Film - bring Van Gogh paintings...  
4             Hamstercide: An Animated Musical Short  
5          Release the Beast: An Animated Short Film  


## Uso y comparación semántica de los textos: 


### Rouge-L: 

In [3]:
from rouge_score import rouge_scorer

df_combined = pd.read_csv('pitch_decks_combined.csv')
print(f"DataFrame cargado ({len(df_combined)} filas):")

# The scorer's configuration does not depend on the row, so build it once
# instead of once per iteration.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

rouge_scores = []
for original, created in zip(df_combined['original_text'], df_combined['created_text']):
    # The blurb is passed first (reference) and the generated text second.
    rouge_l = scorer.score(original, created)['rougeL']

    rouge_scores.append({
        'Precision': round(rouge_l.precision, 3),
        'Recall': round(rouge_l.recall, 3),
        'F1': round(rouge_l.fmeasure, 3)
    })

# One dictionary per row; later cells read the 'Precision', 'Recall' and
# 'F1' keys from this column.
df_combined['Rouge-L-Score'] = rouge_scores


DataFrame cargado (1379 filas):


In [19]:
# Preview the first rows with the new Rouge-L-Score column.
df_combined.head()

Unnamed: 0,category_name,original_text,created_text,name,model_name,prompt_type,Rouge-L-Score
0,3D Printing,Organize anything with our innovative 3D print...,"<think>\nOkay, so I need to summarize the esse...",Thread Boards 2.0 | 3D printable peg boards wi...,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.021, 'Recall': 0.526, 'F1': 0.04}"
1,Animation,Alyson confronts her perfectionism as she stru...,"<think>\nOkay, I need to summarize the essence...",Alyssum - A 2D Animated Short Film,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.013, 'Recall': 0.227, 'F1': 0...."
2,Animation,A boy wonders if there might be other people l...,"<think>\nOkay, the user wants a summary of the...",Is There Anyone Out There?,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.029, 'Recall': 0.308, 'F1': 0...."
3,Animation,This will be the world's first feature-length ...,"<think>\nOkay, I need to summarize the essence...",Loving Vincent Film - bring Van Gogh paintings...,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.022, 'Recall': 0.476, 'F1': 0...."
4,Animation,A student animated musical short in which a ha...,"<think>\nOkay, let's tackle this query. The us...",Hamstercide: An Animated Musical Short,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.036, 'Recall': 0.429, 'F1': 0...."


### BLEU: 

In [20]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


# Smoothing method handed to every sentence_bleu call below.
smooth = SmoothingFunction().method4

bleu_scores = []
for reference_text, generated_text in zip(df_combined['original_text'], df_combined['created_text']):
    # Lower-cased whitespace tokens; sentence_bleu expects a list of references.
    references = [reference_text.lower().split()]
    hypothesis = generated_text.lower().split()

    bleu = sentence_bleu(references, hypothesis, smoothing_function=smooth)
    bleu_scores.append(round(bleu, 3))

df_combined['BLEU'] = bleu_scores

In [21]:
# Preview the first rows with the new BLEU column.
df_combined.head()

Unnamed: 0,category_name,original_text,created_text,name,model_name,prompt_type,Rouge-L-Score,BLEU
0,3D Printing,Organize anything with our innovative 3D print...,"<think>\nOkay, so I need to summarize the esse...",Thread Boards 2.0 | 3D printable peg boards wi...,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.021, 'Recall': 0.526, 'F1': 0.04}",0.002
1,Animation,Alyson confronts her perfectionism as she stru...,"<think>\nOkay, I need to summarize the essence...",Alyssum - A 2D Animated Short Film,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.013, 'Recall': 0.227, 'F1': 0....",0.002
2,Animation,A boy wonders if there might be other people l...,"<think>\nOkay, the user wants a summary of the...",Is There Anyone Out There?,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.029, 'Recall': 0.308, 'F1': 0....",0.004
3,Animation,This will be the world's first feature-length ...,"<think>\nOkay, I need to summarize the essence...",Loving Vincent Film - bring Van Gogh paintings...,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.022, 'Recall': 0.476, 'F1': 0....",0.004
4,Animation,A student animated musical short in which a ha...,"<think>\nOkay, let's tackle this query. The us...",Hamstercide: An Animated Musical Short,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.036, 'Recall': 0.429, 'F1': 0....",0.005


### Embeddings + Similitud coseno: 

In [22]:
from sentence_transformers import SentenceTransformer, util


# Sentence-embedding model used for the cosine-similarity metric.
# NOTE(review): this model presumably truncates long inputs (its model card
# documents a 256 word-piece limit), so long generated texts may only be
# partially embedded -- confirm.
model = SentenceTransformer('all-MiniLM-L6-v2')

semantic_similarities = []
for original_text, created_text in zip(df_combined['original_text'], df_combined['created_text']):
    # Encode both texts together: first row is the blurb, second the generated text.
    emb_original, emb_created = model.encode([original_text, created_text], convert_to_tensor=True)

    # Cosine similarity between the two embeddings, as a plain float.
    cosine = util.pytorch_cos_sim(emb_original, emb_created).item()
    semantic_similarities.append(round(cosine, 3))

df_combined['Semantic_Similarity'] = semantic_similarities

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
# Preview the first rows with the new Semantic_Similarity column.
df_combined.head()

Unnamed: 0,category_name,original_text,created_text,name,model_name,prompt_type,Rouge-L-Score,BLEU,Semantic_Similarity
0,3D Printing,Organize anything with our innovative 3D print...,"<think>\nOkay, so I need to summarize the esse...",Thread Boards 2.0 | 3D printable peg boards wi...,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.021, 'Recall': 0.526, 'F1': 0.04}",0.002,0.668
1,Animation,Alyson confronts her perfectionism as she stru...,"<think>\nOkay, I need to summarize the essence...",Alyssum - A 2D Animated Short Film,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.013, 'Recall': 0.227, 'F1': 0....",0.002,0.154
2,Animation,A boy wonders if there might be other people l...,"<think>\nOkay, the user wants a summary of the...",Is There Anyone Out There?,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.029, 'Recall': 0.308, 'F1': 0....",0.004,0.134
3,Animation,This will be the world's first feature-length ...,"<think>\nOkay, I need to summarize the essence...",Loving Vincent Film - bring Van Gogh paintings...,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.022, 'Recall': 0.476, 'F1': 0....",0.004,0.486
4,Animation,A student animated musical short in which a ha...,"<think>\nOkay, let's tackle this query. The us...",Hamstercide: An Animated Musical Short,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.036, 'Recall': 0.429, 'F1': 0....",0.005,0.551


### Meteor: 


In [24]:
import nltk
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize

# Download the NLTK resources this cell needs ('punkt' and 'punkt_tab' are for tokenisation).
for resource in ('wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(resource)

meteor_scores = []

for idx in range(len(df_combined)):
    try:
        row = df_combined.iloc[idx]

        # Coerce to str before tokenising so non-string cells do not break lower().
        reference_tokens = word_tokenize(str(row['original_text']).lower())
        hypothesis_tokens = word_tokenize(str(row['created_text']).lower())

        meteor = meteor_score([reference_tokens], hypothesis_tokens)
        meteor_scores.append(round(meteor, 3))

    except Exception as e:
        print(f"Error en fila {idx}: {str(e)}")
        # Keep the list aligned with the DataFrame rows when a row fails.
        meteor_scores.append(None)

df_combined['METEOR_Score'] = meteor_scores


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bugy1\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\bugy1\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bugy1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\bugy1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [27]:
# Preview the first rows with the new METEOR_Score column.
df_combined.head()

Unnamed: 0,category_name,original_text,created_text,name,model_name,prompt_type,Rouge-L-Score,BLEU,Semantic_Similarity,METEOR_Score
0,3D Printing,Organize anything with our innovative 3D print...,"<think>\nOkay, so I need to summarize the esse...",Thread Boards 2.0 | 3D printable peg boards wi...,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.021, 'Recall': 0.526, 'F1': 0.04}",0.002,0.668,0.119
1,Animation,Alyson confronts her perfectionism as she stru...,"<think>\nOkay, I need to summarize the essence...",Alyssum - A 2D Animated Short Film,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.013, 'Recall': 0.227, 'F1': 0....",0.002,0.154,0.053
2,Animation,A boy wonders if there might be other people l...,"<think>\nOkay, the user wants a summary of the...",Is There Anyone Out There?,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.029, 'Recall': 0.308, 'F1': 0....",0.004,0.134,0.141
3,Animation,This will be the world's first feature-length ...,"<think>\nOkay, I need to summarize the essence...",Loving Vincent Film - bring Van Gogh paintings...,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.022, 'Recall': 0.476, 'F1': 0....",0.004,0.486,0.095
4,Animation,A student animated musical short in which a ha...,"<think>\nOkay, let's tackle this query. The us...",Hamstercide: An Animated Musical Short,deepseek-ai/deepseek-r1,generic,"{'Precision': 0.036, 'Recall': 0.429, 'F1': 0....",0.005,0.551,0.121


In [28]:
# Save the scored DataFrame as CSV (without the index column).
df_combined.to_csv('pitches_combinados.csv', index=False)

# Resultados


## Genérico 

In [8]:
import pandas as pd
import ast

# Load the scored pitches.
df = pd.read_csv("./pitches_combinados.csv")
print("Tamaño del DataFrame: ", len(df))

# Running count of the rows summarised across all models.
tamanio_muestra = 0

print("-------------- Evaluaciones para prompts genéricos --------------")

# Models to evaluate.
modelos = [
    "marin/marin-8b-instruct",
    "deepseek-ai/deepseek-r1",
    "qwen/qwen3-235b-a22b"
]

# Metric columns to summarise.
metricas = ['BLEU', 'Semantic_Similarity', 'METEOR_Score', 'Rouge_Precision', 'Rouge_Recall', 'Rouge_F1']

# Output column -> key inside each Rouge-L dictionary.
columnas_rouge = {
    'Rouge_Precision': 'Precision',
    'Rouge_Recall': 'Recall',
    'Rouge_F1': 'F1',
}

for modelo in modelos:
    print(f"\n=== Evaluación para modelo: {modelo} ===")

    # Keep only this model's rows produced with the generic prompt.
    es_modelo = df["model_name"] == modelo
    es_generico = df["prompt_type"] == "generic"
    df_model = df[es_modelo & es_generico].copy()

    if df_model.empty:
        print(f"No hay datos para modelo: {modelo} con prompt_type 'generic'")
        continue

    # After the CSV round trip the dictionaries arrive as strings; parse them back.
    if isinstance(df_model['Rouge-L-Score'].iloc[0], str):
        df_model['Rouge-L-Score'] = df_model['Rouge-L-Score'].apply(ast.literal_eval)

    # Spread the Rouge-L dictionary into one numeric column per component.
    for columna, clave in columnas_rouge.items():
        df_model[columna] = df_model['Rouge-L-Score'].apply(lambda x, clave=clave: x.get(clave, None))

    # Mean and standard deviation of every metric.
    promedios = df_model[metricas].mean()
    desv_std = df_model[metricas].std()

    print("\n>>> Promedios:")
    print(promedios.round(4))

    print("\n>>> Desviación estándar:")
    print(desv_std.round(4))
    tamanio_muestra += len(df_model)

print("Tamaño del dataframe genérico: ", tamanio_muestra)


Tamaño del DataFrame:  1379
-------------- Evaluaciones para prompts genéricos --------------

=== Evaluación para modelo: marin/marin-8b-instruct ===

>>> Promedios:
BLEU                   0.0126
Semantic_Similarity    0.5067
METEOR_Score           0.1881
Rouge_Precision        0.0737
Rouge_Recall           0.2959
Rouge_F1               0.1135
dtype: float64

>>> Desviación estándar:
BLEU                   0.0159
Semantic_Similarity    0.1791
METEOR_Score           0.0757
Rouge_Precision        0.0400
Rouge_Recall           0.1208
Rouge_F1               0.0504
dtype: float64

=== Evaluación para modelo: deepseek-ai/deepseek-r1 ===

>>> Promedios:
BLEU                   0.0034
Semantic_Similarity    0.4688
METEOR_Score           0.1086
Rouge_Precision        0.0219
Rouge_Recall           0.4671
Rouge_F1               0.0416
dtype: float64

>>> Desviación estándar:
BLEU                   0.0027
Semantic_Similarity    0.1491
METEOR_Score           0.0436
Rouge_Precision        0.0087
Rou

## Estructurado 

In [9]:
import pandas as pd
import ast

# Load the scored pitches.
df = pd.read_csv("./pitches_combinados.csv")

print("-------------- Evaluaciones para prompts estructurados --------------")

# Prompt type summarised by this cell. The row filter and the "no data"
# message both use it, so they cannot drift apart (the message used to say
# 'generic' here).
PROMPT_TYPE = "structured"

# Models to evaluate.
modelos = [
    "marin/marin-8b-instruct",
    "deepseek-ai/deepseek-r1",
    "qwen/qwen3-235b-a22b"
]

# Metric columns to summarise.
metricas = ['BLEU', 'Semantic_Similarity', 'METEOR_Score', 'Rouge_Precision', 'Rouge_Recall', 'Rouge_F1']

for modelo in modelos:
    print(f"\n=== Evaluación para modelo: {modelo} ===")

    # Keep only this model's rows produced with the structured prompt.
    df_model_2 = df[
        (df["model_name"] == modelo) &
        (df["prompt_type"] == PROMPT_TYPE)
    ].copy()

    if df_model_2.empty:
        print(f"No hay datos para modelo: {modelo} con prompt_type '{PROMPT_TYPE}'")
        continue

    # After the CSV round trip the dictionaries arrive as strings; parse them back.
    if isinstance(df_model_2['Rouge-L-Score'].iloc[0], str):
        df_model_2['Rouge-L-Score'] = df_model_2['Rouge-L-Score'].apply(ast.literal_eval)

    # Spread the Rouge-L dictionary into one numeric column per component.
    df_model_2['Rouge_Precision'] = df_model_2['Rouge-L-Score'].apply(lambda x: x.get('Precision', None))
    df_model_2['Rouge_Recall'] = df_model_2['Rouge-L-Score'].apply(lambda x: x.get('Recall', None))
    df_model_2['Rouge_F1'] = df_model_2['Rouge-L-Score'].apply(lambda x: x.get('F1', None))

    # Mean and standard deviation of every metric.
    promedios = df_model_2[metricas].mean()
    desv_std = df_model_2[metricas].std()

    print("\n>>> Promedios:")
    print(promedios.round(4))

    print("\n>>> Desviación estándar:")
    print(desv_std.round(4))


-------------- Evaluaciones para prompts estructurados --------------

=== Evaluación para modelo: marin/marin-8b-instruct ===

>>> Promedios:
BLEU                   0.0266
Semantic_Similarity    0.7004
METEOR_Score           0.3091
Rouge_Precision        0.0845
Rouge_Recall           0.6653
Rouge_F1               0.1476
dtype: float64

>>> Desviación estándar:
BLEU                   0.0232
Semantic_Similarity    0.1145
METEOR_Score           0.1056
Rouge_Precision        0.0321
Rouge_Recall           0.1794
Rouge_F1               0.0507
dtype: float64

=== Evaluación para modelo: deepseek-ai/deepseek-r1 ===

>>> Promedios:
BLEU                   0.0151
Semantic_Similarity    0.5388
METEOR_Score           0.1600
Rouge_Precision        0.0330
Rouge_Recall           0.9208
Rouge_F1               0.0634
dtype: float64

>>> Desviación estándar:
BLEU                   0.0091
Semantic_Similarity    0.0926
METEOR_Score           0.0530
Rouge_Precision        0.0113
Rouge_Recall           0.08