In [None]:
from dotenv import load_dotenv
import os

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Fail early with a readable message: assigning None into os.environ raises an
# opaque "TypeError: str expected, not NoneType" when the key is missing.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key


In [None]:
import logging
import random
from bespokelabs import curator

# Set the "bespokelabs.curator" logger to INFO. getLogger returns the same
# logger object for a given name, so `logger` below is the one just configured.
logging.getLogger("bespokelabs.curator").setLevel(logging.INFO)
logger = logging.getLogger("bespokelabs.curator")


In [None]:
from pydantic import BaseModel, Field

# Response schema assigned to StartupIdeaGenerator.response_format below:
# a single required text field holding the generated idea.
class StartupIdea(BaseModel):
    idea: str = Field(..., description="A single creative, viable startup idea.")

class StartupIdeaGenerator(curator.LLM):
    """Curator LLM subclass that asks for one startup idea per input row."""
    response_format = StartupIdea

    def prompt(self, input: dict) -> str:
        """Build the request text for one row.

        Args:
            input: Row dict; only its ``"index"`` entry is read, to number the idea.

        Returns:
            The full prompt: a numbered header, the requirements list, and the
            closing instruction.
        """
        idea_number = input["index"]
        header = f"Generate Startup Idea #{idea_number}.\n\n"
        requirements = (
            "Requirements:\n"
            "- The idea should be creative but plausible.\n"
            "- Cover a real-world problem or market need.\n"
            "- Be specific about the product or service.\n"
            "- Mention the potential target audience.\n"
            "- Avoid buzzwords like 'disruptive', 'revolutionary', etc.\n"
            "- Make sure the idea is understandable by a layperson.\n\n"
        )
        closing = "Return only the startup idea text, no extra commentary."
        return header + requirements + closing

    def parse(self, input: dict, response: StartupIdea) -> dict:
        """Turn one structured response into an output row.

        Args:
            input: The row that produced the request; its ``"index"`` is carried over.
            response: Parsed ``StartupIdea`` returned for that row.

        Returns:
            Dict with the row's ``index`` and the whitespace-trimmed ``idea`` text.
        """
        row_index = input["index"]
        idea_text = response.idea.strip()
        return {"index": row_index, "idea": idea_text}

# Instantiate the generator for gpt-4o-mini on the OpenAI backend with batch mode off.
generator = StartupIdeaGenerator(model_name="gpt-4o-mini", backend="openai", batch=False)


In [None]:
# One input row per requested idea, numbered 1 through 200.
inputs = [{"index": idea_number} for idea_number in range(1, 201)]
startup_ideas = generator(inputs)


In [None]:
# Spot-check: print one randomly chosen generated idea.
random_idea = random.choice(startup_ideas)
idea_number, idea_text = random_idea["index"], random_idea["idea"]
print(f"Startup Idea #{idea_number}:\n\n{idea_text}")


Startup Idea #54:

A mobile app called "Seniors Connect" that serves as a social networking platform tailored specifically for elderly users. The app combines features of traditional social media with tools to promote mental health, physical activity, and community engagement. Users can share stories, photos, and experiences, participate in group activities like virtual book clubs or exercise sessions, and access resources for health management and emergency help. The target audience includes seniors aged 65 and over, their families looking to keep them socially active, and retirement communities aiming to enhance resident engagement. The goal is to combat loneliness and improve overall well-being for older adults.


In [None]:
from together import Together
import os

# Fail early with a readable message: assigning None into os.environ raises an
# opaque "TypeError: str expected, not NoneType" when the key is missing.
together_api_key = os.getenv("TOGETHER_API_KEY")
if not together_api_key:
    raise ValueError(
        "TOGETHER_API_KEY is not set. Add it to your .env file or export it before running this notebook."
    )
os.environ["TOGETHER_API_KEY"] = together_api_key
client = Together(api_key=together_api_key)


In [None]:
import time
from tqdm import tqdm
from collections import deque
import random

# Reasoning loop config

# Sliding-window rate limiter: at most `max_qpm` requests per `window_seconds`.
window_seconds = 60
max_qpm = 60  # maximum queries per minute
request_timestamps = deque()  # send times of the requests still inside the window

# Per-request generation cap and run statistics.
max_tokens = 2048
error_count = 0
response_times = []

# One output list per reasoning track; the loop appends one entry per startup idea.
junior_partner_reasonings, hawk_reasonings, fin_reasonings, fit_reasonings = [], [], [], []

# Reasoning loop

# Personas for the three General Partners. Built once here rather than on every
# loop iteration; dict order fixes the order in which the partners are queried.
partner_roles = {
    "hawk": "You are HAWK, a General Partner at Reasoned Capital. You deeply care about whether an idea is solid and marketable. Be skeptical. Think hard about the possible risks and flaws.",
    "fin": "You are FIN, a General Partner at Reasoned Capital. You are obsessed with financial viability. Think hard about whether this idea could realistically generate good returns.",
    "fit": "You are FIT, a General Partner at Reasoned Capital. You deeply care about founder-market fit. Think about whether the founders (assumed) would be the right people to build this company."
}

# Output list for each persona, keyed like partner_roles.
_partner_outputs = {
    "hawk": hawk_reasonings,
    "fin": fin_reasonings,
    "fit": fit_reasonings,
}


def _wait_for_request_slot():
    """Block until one more request fits in the sliding window, then record it.

    Timestamps that have aged out of the last `window_seconds` are dropped. If
    `max_qpm` requests are still inside the window, sleep until the oldest one
    expires and check again, so the deque never holds more than `max_qpm` entries.
    """
    while True:
        now = time.time()
        while request_timestamps and now - request_timestamps[0] >= window_seconds:
            request_timestamps.popleft()
        if len(request_timestamps) < max_qpm:
            break
        sleep_time = window_seconds - (now - request_timestamps[0])
        print(f"⏳ Rate limit hit — sleeping {sleep_time:.1f}s...")
        time.sleep(sleep_time)
    request_timestamps.append(time.time())


def _ask_deepseek(prompt, idx, label, stop=None):
    """Send one prompt to DeepSeek-R1, retrying once after 60 seconds on any error.

    Args:
        prompt: User message content.
        idx: Startup idea index, used only in log messages.
        label: Role name shown in log messages (e.g. "junior partner", "hawk").
        stop: Optional list of stop sequences; omitted from the request when None.

    Returns:
        The stripped response text, or None when both attempts failed.

    Side effects:
        Appends request durations to `response_times` (0.0 after a failed
        retry), increments `error_count` once per failed attempt, and records
        every attempt, including the retry, in the rate limiter.
    """
    global error_count

    request_kwargs = {
        "model": "deepseek-ai/DeepSeek-R1",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
    }
    if stop is not None:
        request_kwargs["stop"] = stop

    for attempt in (1, 2):
        _wait_for_request_slot()
        try:
            started = time.time()
            response = client.chat.completions.create(**request_kwargs)
            response_times.append(time.time() - started)
            return response.choices[0].message.content.strip()
        except Exception as exc:
            error_count += 1
            if attempt == 1:
                print(f"⚠️ Error on idea {idx} ({label}): {exc}")
                print("⏱️ Retrying after 60 seconds...")
                time.sleep(60)
            else:
                print(f"❌ Retry failed on idea {idx} ({label}): {exc}")
                # 0.0 marks a request that produced no usable response.
                response_times.append(0.0)
    return None


for startup in tqdm(startup_ideas, desc="Generating Reasoning with DeepSeek"):
    idx = startup["index"]
    idea = startup["idea"]

    ## Step 1: Junior partner pitch
    junior_prompt = f"""
You are a junior VC partner at Reasoned Capital. You are preparing to pitch the following startup idea to the General Partner cohort.

Startup Idea:
{idea}

Explain whether you think it's a good or bad idea overall. Highlight both the strengths and weaknesses, but keep it professional and concise. Output the internal thought process, not a final decision.

Return only your thoughts, no formatting.
""".strip()

    junior_text = _ask_deepseek(junior_prompt, idx, "junior partner")
    if junior_text is None:
        # Placeholder keeps the output lists aligned with startup_ideas.
        junior_text = "Error generating junior partner pitch."
    junior_partner_reasonings.append(junior_text)

    ## Step 2: Hawk / Fin / Fit Reasoning
    for partner, role_prompt in partner_roles.items():
        partner_prompt = f"""
<think>
{role_prompt}

Here is the startup idea:
{idea}

Here is what the junior partner thought:
{junior_text}

Think carefully about whether Reasoned Capital should invest. ONLY write your internal reasoning, not a final decision.

Return ONLY inside this <think>...</think> block.
</think>
""".strip()

        partner_text = _ask_deepseek(partner_prompt, idx, partner, stop=["</think>"])
        if partner_text is None:
            clean_text = "<think>Error generating reasoning</think>"
        else:
            # Generation stops at "</think>", so flatten to one line and re-append the tag.
            clean_text = partner_text.replace("\n", " ").replace("\r", " ").strip() + " </think>"

        _partner_outputs[partner].append(clean_text)


Generating Reasoning with DeepSeek:   8%|▊         | 15/200 [33:41<6:54:56, 134.58s/it]

⚠️ Error on idea 16 (fit): Error code: 502 - Error code: 502 -<html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>cloudflare</center>
</body>
</html>

⏱️ Retrying after 60 seconds...


Generating Reasoning with DeepSeek:   8%|▊         | 16/200 [37:09<8:00:44, 156.76s/it]

⚠️ Error on idea 17 (junior partner): Error code: 500 - {"message": "Internal Server Error"}
⏱️ Retrying after 60 seconds...


Generating Reasoning with DeepSeek:  80%|████████  | 160/200 [6:38:19<1:35:16, 142.92s/it]

⚠️ Error on idea 161 (fit): Error code: 502 - Error code: 502 -<html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>cloudflare</center>
</body>
</html>

⏱️ Retrying after 60 seconds...


Generating Reasoning with DeepSeek: 100%|██████████| 200/200 [8:13:08<00:00, 147.94s/it]  


In [None]:
# Spot-check one random idea together with all four reasoning tracks.
sample_idx = random.randint(0, len(startup_ideas) - 1)
sample_idea = startup_ideas[sample_idx]

print(f"Startup Idea #{sample_idea['index']}:\n\n{sample_idea['idea']}")
for heading, reasonings in (
    ("\n--- JUNIOR PARTNER THOUGHTS ---\n", junior_partner_reasonings),
    ("\n--- HAWK REASONING ---\n", hawk_reasonings),
    ("\n--- FIN REASONING ---\n", fin_reasonings),
    ("\n--- FIT REASONING ---\n", fit_reasonings),
):
    print(heading)
    print(reasonings[sample_idx])


Startup Idea #186:

A subscription service that delivers healthy meal kits specifically designed for different dietary restrictions and preferences, such as gluten-free, vegan, or keto. The service would include easy-to-follow recipes along with pre-portioned ingredients sourced from local farms. 

The target audience would be health-conscious individuals, families with members who have specific dietary needs, or busy professionals who want to eat healthy without the hassle of meal planning and grocery shopping.

--- JUNIOR PARTNER THOUGHTS ---

<think>
Okay, let's start by breaking down this startup idea. A subscription meal kit service tailored for dietary restrictions and preferences. The target audience is health-conscious individuals, families with dietary needs, and busy professionals. Hmm, the first thing that comes to mind is the growing trend of personalized nutrition. People are more aware of their dietary needs now, whether it's gluten-free, vegan, keto, etc. So there's defi

In [None]:
import pandas as pd

# Assemble one row per startup idea: the idea itself plus each reasoning track.
# Every list was appended once per idea in generation order, so rows line up by position.
dataset_columns = {
    "index": [entry["index"] for entry in startup_ideas],
    "idea": [entry["idea"] for entry in startup_ideas],
    "junior_partner_pitch": junior_partner_reasonings,
    "hawk_reasoning": hawk_reasonings,
    "fin_reasoning": fin_reasonings,
    "fit_reasoning": fit_reasonings,
}
final_dataset = pd.DataFrame(dataset_columns)

final_dataset.head()


Unnamed: 0,index,idea,junior_partner_pitch,hawk_reasoning,fin_reasoning,fit_reasoning
0,1,A subscription-based meal plan service that sp...,"<think>\nOkay, let's start by breaking down th...",<think> **Market Validation:** While the dema...,<think> **Market Demand & Differentiation**: ...,"<think> Alright, let's dive deeper into the fo..."
1,2,A subscription-based service providing tailore...,"<think>\nOkay, let's start by breaking down th...",<think> **Market Validation:** - **Niche De...,<think> **Market Validation**: The growing pr...,<think> **Founder-Market Fit Analysis:** 1. ...
2,3,A subscription-based service that delivers per...,"<think>\nOkay, let's start by understanding th...",<think> **Market Validation:** The target mar...,"<think> Alright, let's break this down. The m...","<think> Alright, let's break this down. The s..."
3,4,A subscription-based service that provides per...,"<think>\nOkay, let's start by looking at the s...",<think> **Market Validation**: While the dema...,<think> **Market Validation**: The increasing...,"<think> Alright, let's dive deeper into the fo..."
4,5,"A subscription service called ""MealMatch"" that...","<think>\nOkay, let me start by breaking down t...",<think> **Market Validation:** The target aud...,<think> **Market Validation:** The target dem...,<think> **Founder-Market Fit Analysis:** 1. ...


In [None]:
import time
from tqdm import tqdm
from collections import deque
import re

# Parsed manager output, one entry per dataset row, filled by the loop below.
manager_partner_thinks, manager_partner_decisions, manager_partner_explanations = [], [], []

# Fresh rate-limiter window for this pass.
request_timestamps = deque()

def parse_manager_output(full_text):
    """Parse manager partner output into think, decision, explanation.

    Args:
        full_text: Raw model response, expected to look like
            ``<think>...</think>`` followed by ``DECISION: ...`` and
            ``EXPLANATION: ...``.

    Returns:
        Tuple ``(think_text, decision_text, explanation_text)``.

        * ``think_text`` is the content of the first closed ``<think>`` block,
          or ``"Missing <think> block"`` when there is none.
        * ``decision_text`` is the rest of the ``DECISION:`` line with
          surrounding ``*`` / ``[`` / ``]`` markup removed, or ``""``.
        * ``explanation_text`` is everything after ``EXPLANATION:``, possibly
          spanning several lines, up to a following ``DECISION:`` line or the
          end of the text, or ``""``.

        All three are ``"Parse error"`` if parsing raises (e.g. ``full_text``
        is not a string).
    """
    think_text = ""
    decision_text = ""
    explanation_text = ""

    try:
        think_match = re.search(r"<think>(.*?)</think>", full_text, re.DOTALL | re.IGNORECASE)
        if think_match:
            think_text = think_match.group(1).strip()
            # The verdict lives after the reasoning block. Searching the whole
            # text would let a phrase such as "my decision: ..." inside the
            # reasoning be mistaken for the final DECISION line.
            answer_text = full_text[think_match.end():]
        else:
            think_text = "Missing <think> block"
            # An opening tag with no closing tag means the reply was cut off
            # mid-reasoning: everything after the opener is reasoning, so only
            # text before it may contain a verdict.
            opener = re.search(r"<think>", full_text, re.IGNORECASE)
            answer_text = full_text[:opener.start()] if opener else full_text

        # `\**` also accepts markdown-bold labels such as "**DECISION:** Invest".
        decision_match = re.search(r"DECISION:\**\s*(.+)", answer_text, re.IGNORECASE)
        if decision_match:
            # Drop bold markers and the square brackets the prompt's template uses.
            decision_text = decision_match.group(1).strip().strip("*[]").strip()

        # DOTALL keeps explanations that wrap over several lines intact; the
        # lookahead stops at a following DECISION line if the model swapped the order.
        explanation_match = re.search(
            r"EXPLANATION:\**\s*(.+?)(?=\n\s*\**\s*DECISION:|\Z)",
            answer_text,
            re.DOTALL | re.IGNORECASE,
        )
        if explanation_match:
            explanation_text = explanation_match.group(1).strip()

    except Exception as e:
        print(f"⚠️ Parsing error: {e}")
        think_text = "Parse error"
        decision_text = "Parse error"
        explanation_text = "Parse error"

    return think_text, decision_text, explanation_text

# Reasoning loop

# Response recorded when both attempts fail; it is parsed like a real reply.
# NOTE(review): this yields a "Do not invest" decision for rows that were never
# evaluated, so filter on manager_partner_failed_indices before using the data.
MANAGER_ERROR_RESPONSE = "<think>Error generating manager partner reasoning</think>\nDECISION: Do not invest\nEXPLANATION: Error."

# "index" values of the rows whose manager output is the error placeholder.
manager_partner_failed_indices = []

for _, row in tqdm(final_dataset.iterrows(), total=len(final_dataset), desc="Generating Manager Partner Reasoning"):
    idea = row["idea"]
    junior_pitch = row["junior_partner_pitch"]
    hawk = row["hawk_reasoning"]
    fin = row["fin_reasoning"]
    fit = row["fit_reasoning"]

    manager_prompt = f"""
You are the MANAGING PARTNER at Reasoned Capital.

You are evaluating whether to invest in the following startup. You have the following information:

Startup Idea:
{idea}

Junior Partner Thoughts:
{junior_pitch}

General Partner Reasonings:
- Hawk: {hawk}
- Fin: {fin}
- Fit: {fit}

TASK:
1. FIRST, REASON internally inside a <think>...</think> block. DO NOT include any decision or explanation here.
2. AFTER the </think> block, WRITE:

DECISION: [Invest] or [Do not invest]

EXPLANATION: A very short 1–2 sentence explanation why you decided to invest or not.

IMPORTANT: 
- Keep DECISION and EXPLANATION outside the <think> block.
- Follow the exact format shown.
""".strip()

    # One initial attempt plus a single retry; both go through the rate limiter.
    full_text = None
    for attempt in (1, 2):
        # Rate limit management: uses the max_qpm / window_seconds settings from
        # the reasoning-loop config cell so both passes stay in sync.
        while True:
            now = time.time()
            while request_timestamps and now - request_timestamps[0] >= window_seconds:
                request_timestamps.popleft()
            if len(request_timestamps) < max_qpm:
                break
            sleep_time = window_seconds - (now - request_timestamps[0])
            print(f"⏳ Rate limit hit — sleeping {sleep_time:.1f}s...")
            time.sleep(sleep_time)
        request_timestamps.append(time.time())

        try:
            response = client.chat.completions.create(
                model="deepseek-ai/DeepSeek-R1",
                messages=[{"role": "user", "content": manager_prompt}],
                max_tokens=max_tokens
            )
            full_text = response.choices[0].message.content.strip()
            break

        except Exception as exc:
            if attempt == 1:
                print(f"⚠️ Error on idea {row['index']} (manager partner): {exc}")
                print("⏱️ Retrying after 60 seconds...")
                time.sleep(60)
            else:
                print(f"❌ Retry failed on idea {row['index']} (manager partner): {exc}")
                full_text = MANAGER_ERROR_RESPONSE
                manager_partner_failed_indices.append(row["index"])

    think_text, decision_text, explanation_text = parse_manager_output(full_text)

    manager_partner_thinks.append(think_text)
    manager_partner_decisions.append(decision_text)
    manager_partner_explanations.append(explanation_text)


Generating Manager Partner Reasoning:   4%|▍         | 9/200 [04:50<1:43:40, 32.57s/it]

⚠️ Error on idea 10 (manager partner): Error code: 429 - {"message": "You are on tier Build Tier 1, which offers 3 queries and 180000 tokens per minute for this model. Please upgrade to higher tier for higher rate limit at https://api.together.xyz/settings/billing.", "type_": "model_rate_limit"}
⏱️ Retrying after 60 seconds...


Generating Manager Partner Reasoning:  16%|█▌        | 31/200 [18:09<1:20:19, 28.51s/it]

⚠️ Error on idea 32 (manager partner): Error code: 503 - The server is overloaded or not ready yet.
⏱️ Retrying after 60 seconds...


Generating Manager Partner Reasoning:  36%|███▌      | 71/200 [41:19<1:11:03, 33.05s/it]

⚠️ Error on idea 72 (manager partner): Error code: 429 - {"message": "You are on tier Build Tier 1, which offers 3 queries and 180000 tokens per minute for this model. Please upgrade to higher tier for higher rate limit at https://api.together.xyz/settings/billing.", "type_": "model_rate_limit"}
⏱️ Retrying after 60 seconds...


Generating Manager Partner Reasoning:  36%|███▌      | 72/200 [42:47<1:46:02, 49.71s/it]

⚠️ Error on idea 73 (manager partner): Error code: 429 - {"message": "You are on tier Build Tier 1, which offers 3 queries and 180000 tokens per minute for this model. Please upgrade to higher tier for higher rate limit at https://api.together.xyz/settings/billing.", "type_": "model_rate_limit"}
⏱️ Retrying after 60 seconds...


Generating Manager Partner Reasoning:  38%|███▊      | 76/200 [45:59<1:27:11, 42.19s/it]

⚠️ Error on idea 77 (manager partner): Error code: 429 - {"message": "You are on tier Build Tier 1, which offers 3 queries and 180000 tokens per minute for this model. Please upgrade to higher tier for higher rate limit at https://api.together.xyz/settings/billing.", "type_": "model_rate_limit"}
⏱️ Retrying after 60 seconds...


Generating Manager Partner Reasoning:  42%|████▏     | 84/200 [51:21<1:06:03, 34.17s/it]

⚠️ Error on idea 85 (manager partner): Error code: 429 - {"message": "You are on tier Build Tier 1, which offers 3 queries and 180000 tokens per minute for this model. Please upgrade to higher tier for higher rate limit at https://api.together.xyz/settings/billing.", "type_": "model_rate_limit"}
⏱️ Retrying after 60 seconds...


Generating Manager Partner Reasoning:  50%|█████     | 100/200 [1:01:04<50:02, 30.03s/it]

⚠️ Error on idea 101 (manager partner): Error code: 429 - {"message": "You are on tier Build Tier 1, which offers 3 queries and 180000 tokens per minute for this model. Please upgrade to higher tier for higher rate limit at https://api.together.xyz/settings/billing.", "type_": "model_rate_limit"}
⏱️ Retrying after 60 seconds...


Generating Manager Partner Reasoning:  58%|█████▊    | 116/200 [1:11:05<40:44, 29.11s/it]  

⚠️ Error on idea 117 (manager partner): Error code: 429 - {"message": "You are on tier Build Tier 1, which offers 3 queries and 180000 tokens per minute for this model. Please upgrade to higher tier for higher rate limit at https://api.together.xyz/settings/billing.", "type_": "model_rate_limit"}
⏱️ Retrying after 60 seconds...


Generating Manager Partner Reasoning:  69%|██████▉   | 138/200 [1:24:24<32:56, 31.88s/it]  

⚠️ Error on idea 139 (manager partner): Error code: 429 - {"message": "You are on tier Build Tier 1, which offers 3 queries and 180000 tokens per minute for this model. Please upgrade to higher tier for higher rate limit at https://api.together.xyz/settings/billing.", "type_": "model_rate_limit"}
⏱️ Retrying after 60 seconds...


Generating Manager Partner Reasoning:  79%|███████▉  | 158/200 [1:36:37<23:05, 32.99s/it]

⚠️ Error on idea 159 (manager partner): Error code: 429 - {"message": "You are on tier Build Tier 1, which offers 3 queries and 180000 tokens per minute for this model. Please upgrade to higher tier for higher rate limit at https://api.together.xyz/settings/billing.", "type_": "model_rate_limit"}
⏱️ Retrying after 60 seconds...


Generating Manager Partner Reasoning:  98%|█████████▊| 197/200 [1:59:54<01:34, 31.60s/it]

⚠️ Error on idea 198 (manager partner): Error code: 429 - {"message": "You are on tier Build Tier 1, which offers 3 queries and 180000 tokens per minute for this model. Please upgrade to higher tier for higher rate limit at https://api.together.xyz/settings/billing.", "type_": "model_rate_limit"}
⏱️ Retrying after 60 seconds...


Generating Manager Partner Reasoning: 100%|██████████| 200/200 [2:02:37<00:00, 36.79s/it]


In [None]:
# Attach the parsed manager output to the dataset, one new column per parsed field.
for column_name, column_values in {
    "manager_partner_think": manager_partner_thinks,
    "manager_partner_decision": manager_partner_decisions,
    "manager_partner_explanation": manager_partner_explanations,
}.items():
    final_dataset[column_name] = column_values

final_dataset.head()


Unnamed: 0,index,idea,junior_partner_pitch,hawk_reasoning,fin_reasoning,fit_reasoning,manager_partner_think,manager_partner_decision,manager_partner_explanation
0,1,A subscription-based meal plan service that sp...,"<think>\nOkay, let's start by breaking down th...",<think> **Market Validation:** While the dema...,<think> **Market Demand & Differentiation**: ...,"<think> Alright, let's dive deeper into the fo...","Okay, let's start by considering the market po...",Invest,The startup addresses a growing niche with str...
1,2,A subscription-based service providing tailore...,"<think>\nOkay, let's start by breaking down th...",<think> **Market Validation:** - **Niche De...,<think> **Market Validation**: The growing pr...,<think> **Founder-Market Fit Analysis:** 1. ...,"Okay, let's start by considering the market ne...",Do not invest,High operational complexity and regulatory ris...
2,3,A subscription-based service that delivers per...,"<think>\nOkay, let's start by understanding th...",<think> **Market Validation:** The target mar...,"<think> Alright, let's break this down. The m...","<think> Alright, let's break this down. The s...","Okay, let's start by considering the key facto...",Do not invest,High operational complexity and regulatory ris...
3,4,A subscription-based service that provides per...,"<think>\nOkay, let's start by looking at the s...",<think> **Market Validation**: While the dema...,<think> **Market Validation**: The increasing...,"<think> Alright, let's dive deeper into the fo...","Okay, let's break this down. The startup is ta...",Do not invest,High operational complexity and thin margins i...
4,5,"A subscription service called ""MealMatch"" that...","<think>\nOkay, let me start by breaking down t...",<think> **Market Validation:** The target aud...,<think> **Market Validation:** The target dem...,<think> **Founder-Market Fit Analysis:** 1. ...,"Okay, let's start by synthesizing all the inpu...",Do not invest,While the concept addresses a valid market nee...


In [None]:
from huggingface_hub import HfApi, HfFolder, Repository, create_repo


In [None]:
# The Hugging Face token is read from the HF_API_KEY environment variable.
HF_TOKEN = os.getenv("HF_API_KEY")

if not HF_TOKEN:
    # The message names the variable that is actually read above.
    raise ValueError("❌ No Hugging Face token found. Please set HF_API_KEY in your .env file.")

# Store the token via HfFolder and create an API client.
HfFolder.save_token(HF_TOKEN)
api = HfApi()


In [None]:
# Write the finished dataset to ./<dataset_name>/data.parquet.
dataset_name = "synthetic_vc_financial_decisions_reasoning_dataset"
save_dir = f"./{dataset_name}"
parquet_path = f"{save_dir}/data.parquet"

os.makedirs(save_dir, exist_ok=True)
final_dataset.to_parquet(parquet_path, index=False)

print(f"✅ Dataset saved to {parquet_path}")


✅ Dataset saved to ./synthetic_vc_financial_decisions_reasoning_dataset/data.parquet


In [None]:
from huggingface_hub import create_repo

# Target dataset repository on the Hugging Face Hub.
repo_id = "ZennyKenny/synthetic_vc_financial_decisions_reasoning_dataset"

# exist_ok=True is passed so an already existing repo is accepted.
create_repo(repo_id, token=HF_TOKEN, repo_type="dataset", private=False, exist_ok=True)

print(f"✅ Public dataset repo {repo_id} created successfully.")


✅ Public dataset repo ZennyKenny/synthetic_vc_financial_decisions_reasoning_dataset created successfully.


In [None]:
from huggingface_hub import upload_file

# Destination repo and the local file written by the save step.
repo_id = "ZennyKenny/synthetic_vc_financial_decisions_reasoning_dataset"
local_parquet = f"{save_dir}/data.parquet"

# Upload the parquet file to data/data.parquet inside the dataset repo.
upload_file(
    repo_id=repo_id,
    repo_type="dataset",
    path_or_fileobj=local_parquet,
    path_in_repo="data/data.parquet",
    token=HF_TOKEN,
)

print(f"✅ Uploaded data.parquet to https://huggingface.co/datasets/{repo_id}")


data.parquet:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

✅ Uploaded data.parquet to https://huggingface.co/datasets/ZennyKenny/synthetic_vc_financial_decisions_reasoning_dataset
