In [89]:
!pip install openai ollama

Collecting ollama
  Using cached ollama-0.5.1-py3-none-any.whl.metadata (4.3 kB)
Using cached ollama-0.5.1-py3-none-any.whl (13 kB)
Installing collected packages: ollama
Successfully installed ollama-0.5.1



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [133]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where OPENAI_API_KEY comes from, since the
# OpenAI client below reads it from os.environ — confirm the .env contents.
load_dotenv()


True

In [140]:
from pydantic import BaseModel

# One fine-tuning example: a prompt and the completion paired with it.
# (Kept as `#` comments on purpose: a class docstring would be emitted as the
# `description` of the JSON schema that is sent to the model.)
class FormatItem(BaseModel):
    prompt: str  # text of the question / request
    completion:str  # text of the answer for that prompt
# Top-level structured-output schema: both model calls in this notebook are
# asked to answer in this shape and their replies are parsed into it.
class FormatList(BaseModel):
    dataset:list[FormatItem]  # every prompt/completion pair returned by one call


def generate_system_prompt(previous_prompts: "list[str] | None" = None):
    """Build the instruction prompt that asks the model for new prompt/completion pairs.

    Args:
        previous_prompts: Prompts that were already generated. They are listed
            in the prompt so the model can avoid repeating them. ``None`` or an
            empty list means there is no history yet. Entries that are ``None``
            or NaN (what pandas yields for blank CSV cells) are ignored, and
            any other non-string entry is converted with ``str()``.

    Returns:
        The prompt text, including the count and a comma-separated list of the
        previous prompts.
    """
    # `p != p` is true only for NaN; without this filter a single blank cell in
    # the CSV would make ', '.join(...) raise TypeError.
    known_prompts = [
        str(p)
        for p in (previous_prompts or [])
        if not (p is None or (isinstance(p, float) and p != p))
    ]
    content = f"""
            You are an assistant with knowledge of datasets to be used as fine-tuning. The user provides you with a topic and you return a list of questions and answers or a request/completion about the topic. The data you return is intended for fine-tuning LLM. You return concise and understandable questions and answers. You take into account the TOPIC that the user provides you. You return what is related to the TOPIC.
            
            INSTRUCTIONS:
            - YOU NEED TO GENERATE MORE THAN 10 or 20 PROMPT/COMPLETION 
            - GENERATE COMPLETION DETAILED AND SPECIFIED TO REINFORCE DATA TRAINER
            
            <context>
            YOU NEED TO AVOID TO REPEAT THE SAME PROMPTS, THERE IS THE NEXT PROMPTS, TRY TO GENERATE DIFFERENT PROMPT/COMPLETION,
            TOTAL PREVIOUS PROMPTS: {len(known_prompts)}
            PREVIOUS PROMPTS: {', '.join(known_prompts)}
            TRY TO GENERATE DIFFERENT PROMPTS/COMPLETIONS, BASED IN THE PREVIOUS PROMPTS, GENERATE DIFFERENT PROMPTS/COMPLETION BASE ON TOPIC THAT USER PROVIDES YOU
            </context>
            """
    return content

def generate_input(previous_prompts: "list[str] | None" = None, topic:str=''):
    """Build the chat message list sent to the models.

    Args:
        previous_prompts: Prompts generated so far; forwarded to
            ``generate_system_prompt`` so they are listed in the instructions.
            ``None`` is treated as an empty history.
        topic: Subject the model should generate prompt/completion pairs about.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the instruction
        message followed by the user message carrying the topic.
    """
    system_prompt = generate_system_prompt(
        previous_prompts if previous_prompts is not None else []
    )
    return [
        # The instructions go in a "system" message. They were previously sent
        # with the "assistant" role, which presents them to the model as
        # something it said earlier rather than as instructions to follow.
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"TOPIC: {topic}"},
    ]

In [144]:
from openai import AsyncOpenAI
from ollama import AsyncClient,ChatResponse
from typing import Optional
import os

# Ollama endpoint. The pod URL is short-lived, so allow overriding it through
# the OLLAMA_HOST environment variable (e.g. from .env); when the variable is
# not set, the previously hard-coded URL is used unchanged.
ollama_client = AsyncClient(
    host=os.environ.get(
        "OLLAMA_HOST", 'https://zk8ufz7qvz0kxi-11434.proxy.runpod.net/'
    ),
)


# The API key is read from the environment so it never appears in the notebook.
openai_client = AsyncOpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)



async def call_ollama(messages) -> Optional[FormatList]:
    """Send ``messages`` to the Ollama model and parse the reply as a FormatList.

    The FormatList JSON schema is passed as the response format, and the reply
    content is validated against that model.

    Returns:
        The parsed FormatList, or ``None`` when anything fails. The error is
        printed instead of raised, so one failing provider does not stop the run.
    """
    try:
        reply: ChatResponse = await ollama_client.chat(
            model='llama3.1:8b',
            messages=messages,
            format=FormatList.model_json_schema(),
        )
        return FormatList.model_validate_json(reply.message.content)
    except Exception as err:
        print(f"Ollama error: {err}")
    return None
    
    
async def call_openai(messages) -> Optional[FormatList]:
    """Send ``messages`` to the OpenAI model, requesting output parsed as FormatList.

    Returns:
        The ``output_parsed`` value of the response, or ``None`` when the call
        fails. The error is printed instead of raised.
    """
    request = dict(
        model="gpt-4o-mini",
        input=messages,
        text_format=FormatList,
    )
    try:
        parsed_response = await openai_client.responses.parse(**request)
        return parsed_response.output_parsed
    except Exception as err:
        print(f"OpenAI Error: {err}")
    return None
  
  



In [166]:
import asyncio
async def generate_qa(previous_prompts: "list[str] | None" = None, topic:str ="")->list[dict]:
    """Query both models concurrently and merge their prompt/completion pairs.

    Args:
        previous_prompts: Prompts generated so far, passed on so the models are
            told to avoid them. ``None`` is treated as an empty history.
        topic: Subject to generate prompt/completion pairs about.

    Returns:
        A flat list of ``{"prompt": ..., "completion": ...}`` dicts, Ollama
        results first, then OpenAI. A provider that failed contributes nothing,
        so the list is empty when both fail.
    """
    messages = generate_input(
        previous_prompts if previous_prompts is not None else [], topic
    )

    results = await asyncio.gather(
        call_ollama(messages),
        call_openai(messages),
        return_exceptions=True,
    )

    qa = []
    for result in results:
        # With return_exceptions=True a raised error arrives as a value.
        if isinstance(result, BaseException):
            print(f"Provider call failed: {result}")
            continue
        # The call_* helpers return None after reporting their own error.
        if result is None:
            continue
        qa.extend(result.model_dump()['dataset'])

    return qa
    

In [171]:
import pandas as pd
import os

file_name = "output.csv"
async def generate(n: int = 5, topic: str = "Riesgos no Financieros"):
    """Run several generation rounds and append the results to ``file_name``.

    Prompts already stored in the CSV are loaded first and passed to every
    round, together with the prompts produced by earlier rounds, so the models
    are told what to avoid.

    Args:
        n: Number of generation rounds to run.
        topic: Subject to generate prompt/completion pairs about.
    """
    previous_prompts = []

    if os.path.exists(file_name):
        try:
            df_existing = pd.read_csv(file_name)
            # Check whether the 'prompt' column exists and extract it
            if 'prompt' in df_existing.columns:
                # Blank cells are read back as NaN; drop them so that only real
                # prompt strings are handed to the prompt builder.
                previous_prompts = (
                    df_existing['prompt'].dropna().astype(str).tolist()
                )
            else:
                print("Advertencia: La columna 'prompt' no existe en el archivo CSV.")
        except Exception as e:
            print(f"Error al leer el archivo CSV: {e}")
    else:
        print(f"El archivo {file_name} no existe, se creará uno nuevo.")

    for _ in range(n):
        response = await generate_qa(previous_prompts, topic)

        r_df = pd.DataFrame(response)

        print(f"Generated P/C: {len(r_df)}")

        # A round can come back empty (e.g. both providers failed); the frame
        # then has no 'prompt' column, so there is nothing to record or save.
        if r_df.empty:
            continue

        previous_prompts.extend(r_df['prompt'].to_list())

        # Write the header only when starting a new (or empty) file; append
        # mode creates the file when it does not exist yet.
        needs_header = (
            not os.path.exists(file_name) or os.path.getsize(file_name) == 0
        )
        r_df.to_csv(file_name, sep=",", index=False, mode='a', header=needs_header)

    print(f"Saved to {file_name}")



In [169]:
# Run the generation rounds; each round prints how many prompt/completion pairs
# it produced. Top-level `await` is allowed in notebook cells.
await generate()

Generated P/C: 28
Generated P/C: 31
Generated P/C: 32
Generated P/C: 47
Generated P/C: 29
Saved to output.csv


In [172]:
import pandas as pd

# Reload the accumulated CSV from disk to inspect what has been saved.
cfg = pd.read_csv(file_name)


In [174]:
# Show the loaded prompt/completion table.
cfg

Unnamed: 0,prompt,completion
0,¿Cuáles son los principales tipos de riesgos n...,Los principales tipos de riesgos no financiero...
1,¿Cómo se pueden mitigar los riesgos no financi...,"Para mitigar riesgos no financieros, se pueden..."
2,¿Por qué es importante considerar los riesgos ...,Considerar riesgos no financieros en la planif...
3,¿Qué papel juega la cultura organizacional en ...,La cultura organizacional desempeña un papel f...
4,¿Qué herramientas podrían utilizarse para eval...,Las herramientas que se pueden utilizar para e...
...,...,...
231,¿Cuáles son las diferencias clave entre riesgo...,Los riesgos financieros están relacionados con...
232,¿Por qué es relevante considerar los riesgos n...,Incluir riesgos no financieros en informes fin...
233,¿Qué tecnología puede ayudar en la detección d...,Herramientas de inteligencia artificial y apre...
234,¿Qué rol juega la educación continua en la ges...,La educación continua asegura que los empleado...
