# Synthetic Dataset Generation for DPO
---
Aug. 5, 2024

## 1. Topic and Subtopic Generation

In [None]:
import vertexai
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

def init_vertexai(PROJECT_ID: str, REGION: str) -> None:
    """Initialise the Vertex AI SDK for one project and location.

    Args:
        PROJECT_ID: Forwarded to ``vertexai.init`` as ``project``.
        REGION: Forwarded to ``vertexai.init`` as ``location``.
    """
    sdk_settings = {"project": PROJECT_ID, "location": REGION}
    vertexai.init(**sdk_settings)

def create_llm(api_key: str, temperature: float = 0.8, top_p: float = 0.95):
    """Build a ``ChatOpenAI`` client aimed at the Vertex AI OpenAI-compatible endpoint.

    The endpoint URL is assembled from the module-level ``REGION`` and
    ``PROJECT_ID`` names, so both must be defined before this is called.

    Args:
        api_key: Credential passed straight through to ``ChatOpenAI``.
        temperature: Sampling temperature for the model.
        top_p: Nucleus-sampling cutoff for the model.

    Returns:
        A ``ChatOpenAI`` instance configured for ``meta/llama3-405b-instruct-maas``
        with a 4096-token completion limit.
    """
    # NOTE(review): the trailing "?" looks deliberate (it would turn any path
    # the client appends into a query string) -- confirm before changing it.
    endpoint = f"https://{REGION}-aiplatform.googleapis.com/v1beta1/projects/{PROJECT_ID}/locations/{REGION}/endpoints/openapi/chat/completions?"
    llama_client = ChatOpenAI(
        base_url=endpoint,
        model="meta/llama3-405b-instruct-maas",
        api_key=api_key,
        max_tokens=4096,
        temperature=temperature,
        top_p=top_p,
    )
    return llama_client

def create_prompt_template():
    """Return the three-message chat template used for every Llama call.

    The template exposes the variables ``system``, ``human`` and ``ai``; each
    one is wrapped in the header/end-of-turn tokens spelled out below.
    """
    system_turn = ("system", "<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system}<|eot_id|>")
    user_turn = ("human", "<|start_header_id|>user<|end_header_id|>{human}<|eot_id|>")
    assistant_turn = ("ai", "<|start_header_id|>assistant<|end_header_id|>{ai}<|eot_id|>")
    return ChatPromptTemplate.from_messages([system_turn, user_turn, assistant_turn])

def execute_llama_chain(system: str, user: str, llm: ChatOpenAI, prompt: ChatPromptTemplate, stream: bool = True):
    """Run one system/user exchange through ``prompt | llm | StrOutputParser()``.

    Args:
        system: Text substituted for the template's ``system`` variable.
        user: Text substituted for the template's ``human`` variable.
        llm: Chat model placed in the middle of the chain.
        prompt: Chat prompt template placed at the head of the chain.
        stream: When true (the default) the chain is streamed; otherwise it is
            invoked once.

    Returns:
        The result of ``chain.stream(...)`` when ``stream`` is true (callers in
        this file iterate it and concatenate the chunks), otherwise the result
        of ``chain.invoke(...)``.
    """
    chain = prompt | llm | StrOutputParser()
    # The template also declares an "ai" variable; it is always sent empty so
    # the model produces the assistant turn itself.
    chain_inputs = {"system": system, "human": user, "ai": ""}
    if stream:
        return chain.stream(chain_inputs)
    return chain.invoke(chain_inputs)

# Initialize Vertex AI and create LLM
init_vertexai(PROJECT_ID, REGION)
llm = create_llm(get_credentials())
prompt = create_prompt_template()

# Generate topics and subtopics
topics_system = "You are a master categorizer."
topics_user = """
Generate a list of 10 diverse topics related to current events, technology, and general knowledge.
Only generate the number of topics requested.
The topics should be separated by a comma. There must be no other text than the list.
"""

full_topics = ""
for chunk in execute_llama_chain(system=topics_system, user=topics_user, llm=llm, prompt=prompt):
    full_topics += chunk

topics = [topic.strip() for topic in full_topics.split(",")]

for topic in topics:
    subtopics_user = f"""
    Given the topic "{topic}", generate a list of 5 subtopics that are related to it.
    Only generate the number of subtopics requested.
    The subtopics should be separated by a comma. There must be no other text than the list.
    """
    
    full_subtopics = ""
    for chunk in execute_llama_chain(system=topics_system, user=subtopics_user, llm=llm, prompt=prompt):
        full_subtopics += chunk
    
    subtopics = [subtopic.strip() for subtopic in full_subtopics.split(",")]
    print(f"Topic: {topic}")
    print(f"Subtopics: {subtopics}\n")

## 2. Question Generation

In [None]:
def generate_questions(topic: str, subtopic: str, n_questions: int = 5):
    questions_system = f"""
    You are a guru in the topic of {topic} with a particular specialization in {subtopic}.
    You are giving a quiz to your students of varying levels of expertise and age.
    """
    
    questions_user = f"""
    Generate exactly {n_questions} questions that could be asked about {subtopic}.
    The questions should be of varying difficulty and complexity.
    Your response should be in a list format.
    The questions should be separated by "|". There must be no other text than the list.
    """
    
    full_questions = ""
    for chunk in execute_llama_chain(system=questions_system, user=questions_user, llm=llm, prompt=prompt):
        full_questions += chunk
    
    return [question.strip() for question in full_questions.split("|")]

questions_list = []
for topic in topics:
    for subtopic in subtopics:
        questions = generate_questions(topic, subtopic)
        questions_list.extend([{"topic": topic, "subtopic": subtopic, "question": q} for q in questions])

print(f"Generated {len(questions_list)} questions.")

## 3. Answer Generation

### A. Llama-3.1-405B-Instruct

In [None]:
def generate_answers(question: str, topic: str, subtopic: str):
    """Ask the Llama model for two candidate answers to one question.

    Uses the module-level ``llm`` and ``prompt`` objects.

    Args:
        question: Question text embedded in the user prompt.
        topic: Topic named in the system prompt.
        subtopic: Specialisation named in the system prompt.

    Returns:
        ``{"response_a": ..., "response_b": ...}`` where response A is the text
        before the first ``RESPONSE B:`` marker (with every ``RESPONSE A:``
        label removed) and response B is everything after that marker; both
        are stripped of surrounding whitespace.

    Raises:
        ValueError: If the model output contains no ``RESPONSE B:`` marker, so
            the two answers cannot be separated.
    """
    answer_system = f"""
    You are a master of the topic {topic} with a specialization and PhD in {subtopic}.
    """

    answer_user = f"""
    Given the question: "{question}"
    Generate 2 responses (response A and response B) to the question.
    Insert response A below where it says RESPONSE A: and insert response B below where it says RESPONSE B:
    RESPONSE A:
    RESPONSE B:
    
    The above is the output format and should not be changed. Do not output anything else other than the responses.
    """

    full_answers = "".join(
        execute_llama_chain(system=answer_system, user=answer_user, llm=llm, prompt=prompt)
    )

    # partition() cuts at the FIRST marker only, so a marker repeated inside
    # response B no longer truncates it, and a missing marker is detectable.
    before_marker, marker, after_marker = full_answers.partition("RESPONSE B:")
    if not marker:
        raise ValueError(
            f"Model output has no 'RESPONSE B:' marker; cannot split the two responses: {full_answers!r}"
        )

    response_a = before_marker.replace("RESPONSE A:", "").strip()
    response_b = after_marker.strip()

    return {"response_a": response_a, "response_b": response_b}

# Collect a Llama response pair for every generated question, keeping the
# question's own fields alongside it.
answers_list = []
for item in questions_list:
    answers = generate_answers(
        question=item["question"],
        topic=item["topic"],
        subtopic=item["subtopic"],
    )
    answers_list.append(dict(item, responses=answers))

print(f"Generated answers for {len(answers_list)} questions.")

### B. Gemini-1.5-pro-experimental (0801)

In [None]:
import google.generativeai as genai

import os

# Configure the Gemini API.
# Read the key from the GEMINI_API_KEY environment variable so a real secret
# never has to be pasted into the notebook; the original placeholder remains
# the fallback when the variable is unset.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY", "YOUR_GEMINI_API_KEY"))

def generate_answers_gemini(question: str, topic: str, subtopic: str):
    """Ask Gemini for two candidate answers to one question.

    Args:
        question: Question text embedded in the prompt.
        topic: Topic the model is asked to be an expert in.
        subtopic: Specialisation named in the prompt.

    Returns:
        ``{"response_a": ..., "response_b": ...}`` where response A is the text
        before the first ``RESPONSE B:`` marker (with every ``RESPONSE A:``
        label removed) and response B is everything after that marker; both
        are stripped of surrounding whitespace.

    Raises:
        ValueError: If the model output contains no ``RESPONSE B:`` marker, so
            the two answers cannot be separated.
    """
    model = genai.GenerativeModel('gemini-1.5-pro')

    # Named gemini_prompt so it does not shadow the module-level `prompt`
    # template used by the Llama helpers.
    gemini_prompt = f"""
    As an expert in {topic} with a specialization in {subtopic}, provide two distinct responses to the following question:

    Question: {question}

    RESPONSE A:
    [Your first response here]

    RESPONSE B:
    [Your second response here]

    Ensure that both responses are informative and potentially correct, but with different approaches or perspectives.
    """

    response = model.generate_content(gemini_prompt)

    # Parse the response to extract Response A and Response B.
    # partition() cuts at the FIRST marker only, so a marker repeated inside
    # response B no longer truncates it, and a missing marker is detectable.
    content = response.text
    before_marker, marker, after_marker = content.partition("RESPONSE B:")
    if not marker:
        raise ValueError(
            f"Model output has no 'RESPONSE B:' marker; cannot split the two responses: {content!r}"
        )

    response_a = before_marker.replace("RESPONSE A:", "").strip()
    response_b = after_marker.strip()

    return {"response_a": response_a, "response_b": response_b}

# Gemini variant of the answer loop: answers_list is rebuilt from scratch here,
# so running this cell replaces any responses collected from Llama.
answers_list = []
for item in questions_list:
    answers = generate_answers_gemini(
        question=item["question"],
        topic=item["topic"],
        subtopic=item["subtopic"],
    )
    answers_list.append(dict(item, responses=answers))

print(f"Generated answers using Gemini for {len(answers_list)} questions.")

## 4. Evaluation and Scoring

In [None]:
from openai import OpenAI

def get_scores_from_response(openai_response):
    """Map each logprob entry of the first choice to ``{token: logprob}``.

    Args:
        openai_response: Object exposing ``choices[0].logprobs.content``, an
            iterable of entries with ``token`` and ``logprob`` attributes.

    Returns:
        A dict keyed by ``token``; when a token repeats, the last entry wins.
    """
    entries = openai_response.choices[0].logprobs.content
    return {entry.token: entry.logprob for entry in entries}

def evaluate_responses(question: str, response_a: str, response_b: str):
    """Score two candidate answers with the Nemotron reward model.

    Each answer is sent as a separate user/assistant conversation to
    ``nvidia/nemotron-4-340b-reward``; its score is the sum of every value
    returned by ``get_scores_from_response`` for that completion.

    The API key is read from the ``NVIDIA_API_KEY`` environment variable so a
    real secret need not live in the source; the original placeholder is used
    when the variable is unset.

    Args:
        question: The user turn sent with both answers.
        response_a: First candidate answer.
        response_b: Second candidate answer.

    Returns:
        ``{"response_a_score": ..., "response_b_score": ...}``.
    """
    import os  # local import keeps this cell self-contained

    client = OpenAI(
        base_url="https://integrate.api.nvidia.com/v1",
        api_key=os.environ.get("NVIDIA_API_KEY", "YOUR_NVIDIA_API_KEY"),
    )

    def _score(answer: str):
        """Return the summed reward values for one question/answer pair."""
        completion = client.chat.completions.create(
            model="nvidia/nemotron-4-340b-reward",
            messages=[
                {"role": "user", "content": question},
                {"role": "assistant", "content": answer},
            ],
        )
        # NOTE(review): presumably one entry per reward attribute; summing them
        # all weights every attribute equally -- confirm that is intended.
        return sum(get_scores_from_response(completion).values())

    return {
        "response_a_score": _score(response_a),
        "response_b_score": _score(response_b),
    }

# Score both responses of every entry; the result is stored on the entry
# itself under "scores", so answers_list is updated in place.
for item in answers_list:
    candidate_pair = item["responses"]
    scores = evaluate_responses(
        item["question"],
        candidate_pair["response_a"],
        candidate_pair["response_b"],
    )
    item["scores"] = scores

print("Completed evaluation and scoring of responses.")

## 5. Final Dataset Filtering

In [None]:
def filter_dataset(answers_list, threshold=3.5):
    """Turn scored response pairs into chosen/rejected preference records.

    An entry is kept when at least one of its two scores reaches
    ``threshold``. The higher-scoring response becomes ``chosen`` and the
    other ``rejected``; on a tie response A is chosen.

    Args:
        answers_list: Iterable of dicts with ``question``, ``responses``
            (``response_a``/``response_b``) and ``scores``
            (``response_a_score``/``response_b_score``).
        threshold: Minimum score at least one response must reach.

    Returns:
        A list of ``{"question", "chosen", "rejected"}`` dicts in input order.
    """
    kept = []
    for record in answers_list:
        score_a = record["scores"]["response_a_score"]
        score_b = record["scores"]["response_b_score"]
        if not (score_a >= threshold or score_b >= threshold):
            continue

        candidates = record["responses"]
        if score_a >= score_b:
            winner, loser = candidates["response_a"], candidates["response_b"]
        else:
            winner, loser = candidates["response_b"], candidates["response_a"]

        kept.append({
            "question": record["question"],
            "chosen": winner,
            "rejected": loser,
        })
    return kept

# Build the preference dataset with the default score threshold.
final_dataset = filter_dataset(answers_list)
print(f"Final dataset contains {len(final_dataset)} high-quality examples.")

# Save the final dataset
import json
serialized_dataset = json.dumps(final_dataset, indent=2)
with open("dpo_synthetic_dataset.json", "w") as f:
    f.write(serialized_dataset)

print("Saved final dataset to dpo_synthetic_dataset.json")

---
#### End of document