# 3.1 Enrich Startup Dataset

In [1]:
import pandas as pd

In [2]:
df_listings = pd.read_csv("data/eustartup_listings_cleaned.csv")

df_listings.head(2)

Unnamed: 0,name,link_startupeu,link_logo,website,city,region,tags,category,business_description,founded,age,total_funding,company_status,social_links
0,Avdain,https://www.eu-startups.com/directory/avdain/,https://www.eu-startups.com/wp-content/uploads...,avdain.com,Vienna,Vienna,"company, startup, one person",Software & Analytics,Avdain is a enterprise that embodies a fusion ...,2020,5,No funding announced yet,Active,()
1,Popper Power GmbH,https://www.eu-startups.com/directory/popper-p...,https://www.eu-startups.com/wp-content/uploads...,www.popperpower.com,Vienna,Vienna,"ev, battery, bess, charging",Energy,Popper Power GmbH develops advanced energy sto...,2022,3,Between €500K-€ 1 million,Active,"('https://www.linkedin.com/company/86313916',)"


In [4]:
columns = df_listings.columns.to_list()

for column in columns:
    print(column)

name
link_startupeu
link_logo
website
city
region
tags
category
business_description
founded
age
total_funding
company_status
social_links


# Enrich the dataset

## Open Ai API Requests

To assess the quality and get variables out of: 
- business_description


### Description Assessment

The `extract_description_features` function takes a startup’s written description as input and uses the OpenAI `gpt-4o-mini` model to parse it into a structured JSON object containing a rich set of features that capture writing quality, clarity, novelty, market posture, founder credibility, sentiment, traction signals and key textual metrics. It sends a system prompt defining exactly which fields to extract—scores on a 1–10 scale for writing, clarity, innovativeness, market readiness, founder signal, sentiment and traction; raw counts for words, sentences and numeric evidence; a jargon density ratio; a binary flag for calls-to-action; a list of the top three keywords; and a categorical business model tag.  

**Extracted variables**  
- `writing_score` (1–10): how well-written the text is (grammar, fluency, vocabulary)  
- `clarity_score` (1–10): how easily the reader grasps what the startup does  
- `innovativeness_score` (1–10): how novel or differentiated the idea sounds  
- `market_readiness_score` (1–10): how close to product–market fit or revenue launch  
- `founder_signal_score` (1–10): the strength of the founder’s perceived expertise and credibility  
- `sentiment_score` (1–10): overall positivity vs. negativity of the description  
- `traction_score` (1–10): evidence of traction such as user counts, pilots or revenue mentions  
- `word_count` (int): total number of words  
- `sentence_count` (int): total number of sentences  
- `jargon_density` (float 0–1): proportion of domain-specific terms vs. plain language  
- `numeric_evidence_count` (int): count of numeric expressions (e.g. “10,000 users”)  
- `call_to_action_flag` (0 or 1): whether marketing CTAs like “sign up” appear  
- `top_3_keywords` (list): the three most important keywords or phrases summarizing the business  
- `business_model` (enum): one of “B2B”, “B2C”, “B2B2C”, “B2G”, “marketplace”, “platform” or “other”  


In [55]:
import os
import json
from tqdm import tqdm
import openai
from dotenv import load_dotenv
import requests

In [35]:
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

print([m.id for m in openai.models.list().data])

['gpt-4o-audio-preview-2024-12-17', 'dall-e-3', 'dall-e-2', 'gpt-4o-audio-preview-2024-10-01', 'gpt-4-turbo-preview', 'text-embedding-3-small', 'gpt-4-turbo', 'gpt-4-turbo-2024-04-09', 'gpt-4.1-nano', 'gpt-4.1-nano-2025-04-14', 'gpt-4o-realtime-preview-2024-10-01', 'gpt-4o-realtime-preview', 'babbage-002', 'gpt-4', 'text-embedding-ada-002', 'chatgpt-4o-latest', 'gpt-4o-realtime-preview-2024-12-17', 'gpt-4o-mini-audio-preview', 'gpt-4o-audio-preview', 'o1-preview-2024-09-12', 'gpt-4o-mini-realtime-preview', 'gpt-4.1-mini', 'gpt-4o-mini-realtime-preview-2024-12-17', 'gpt-3.5-turbo-instruct-0914', 'gpt-4o-mini-search-preview', 'gpt-4.1-mini-2025-04-14', 'davinci-002', 'gpt-3.5-turbo-1106', 'gpt-4o-search-preview', 'gpt-4-1106-preview', 'gpt-3.5-turbo-instruct', 'gpt-3.5-turbo', 'gpt-4o-mini-search-preview-2025-03-11', 'gpt-4-0125-preview', 'gpt-4o-2024-11-20', 'whisper-1', 'gpt-4o-2024-05-13', 'gpt-3.5-turbo-16k', 'gpt-image-1', 'o1-preview', 'gpt-4-0613', 'text-embedding-3-large', 'gpt-4

In [36]:
# Define the extraction function
def extract_description_features(desc: str) -> dict:
    system_prompt = """
You are an expert startup analyst. Given a business description, you will extract the following fields:

- writing_score (int 1–10): grammar, fluency & vocabulary richness
- clarity_score (int 1–10): ease of understanding what they do
- innovativeness_score (int 1–10): novelty & differentiation of the idea
- market_readiness_score (int 1–10): proximity to product–market fit or revenue launch
- founder_signal_score (int 1–10): perceived founder expertise and credibility
- sentiment_score (int 1–10): overall tone positivity vs. negativity
- traction_score (int 1–10): evidence of traction (users, pilots, revenue)
- word_count (int)
- sentence_count (int)
- jargon_density (float between 0 and 1)
- numeric_evidence_count (int)
- call_to_action_flag (0 or 1)
- top_3_keywords (list of three strings)
- business_model (one of: "B2B","B2C","B2B2C","B2G","marketplace","platform","other")

Respond with only the JSON object, no extra text or markdown.
""".strip()
    
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user",   "content": desc}
    ]
    resp = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0,
        max_tokens=600
    )
    raw = resp.choices[0].message.content.strip()
    if raw.startswith("```"):
        raw = "\n".join(raw.split("\n")[1:-1]).strip()
    return json.loads(raw)

In [37]:
# Prepare feature keys for fallback
feature_keys = [
    "writing_score", "clarity_score", "innovativeness_score",
    "market_readiness_score", "founder_signal_score", "sentiment_score",
    "traction_score", "word_count", "sentence_count", "jargon_density",
    "numeric_evidence_count", "call_to_action_flag",
    "top_3_keywords", "business_model"
]

# Iterate and extract features
feature_dicts = []
for desc in tqdm(df_listings["business_description"], desc="Extracting features"):
    try:
        feats = extract_description_features(desc)
    except Exception:
        feats = {key: None for key in feature_keys}
    feature_dicts.append(feats)

Extracting features: 100%|██████████| 518/518 [19:20<00:00,  2.24s/it]


In [38]:
# Build df_new from extracted features
df_listings_desc_enriched = pd.DataFrame(feature_dicts)

df_listings_desc_enriched

Unnamed: 0,writing_score,clarity_score,innovativeness_score,market_readiness_score,founder_signal_score,sentiment_score,traction_score,word_count,sentence_count,jargon_density,numeric_evidence_count,call_to_action_flag,top_3_keywords,business_model,innovativness_score
0,8,7,9.0,6,8,9,5,290,12,0.250000,0,0,"[innovation, academic, entrepreneurial]",B2B,
1,7,8,9.0,6,7,8,5,42,2,0.210000,0,0,"[energy storage, EV charging, infrastructure]",B2B,
2,8,9,7.0,8,6,9,7,305,15,0.200000,2,1,"[sales, AI, integration]",B2B,
3,8,9,7.0,8,6,9,7,203,10,0.200000,0,0,"[customer feedback, CX, business goals]",B2B,
4,8,9,7.0,6,5,8,4,335,15,0.200000,0,0,"[AI, platform, efficiency]",B2B,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,7,8,9.0,6,5,8,4,22,1,0.136000,0,0,"[automated, conductive, charging]",B2B,
514,6,5,,3,4,6,2,14,1,0.214286,0,0,"[health, tech, enabler]",B2B2C,4.0
515,8,9,,6,5,8,4,8,1,0.000000,0,0,"[real estate, investments, everyone]",B2C,7.0
516,7,8,,5,6,8,4,22,1,0.180000,0,0,"[urban mobility, mobility services, sustainable]",B2B,7.0


In [45]:
# show *all* rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# widen the “page” so it won’t wrap columns  
pd.set_option('display.width', 1000)

In [39]:
# Combine new data with original df
df_listings_enriched = pd.concat([df_listings.reset_index(drop=True), df_listings_desc_enriched], axis=1)

df_listings_enriched

Unnamed: 0,name,link_startupeu,link_logo,website,city,region,tags,category,business_description,founded,...,sentiment_score,traction_score,word_count,sentence_count,jargon_density,numeric_evidence_count,call_to_action_flag,top_3_keywords,business_model,innovativness_score
0,Avdain,https://www.eu-startups.com/directory/avdain/,https://www.eu-startups.com/wp-content/uploads...,avdain.com,Vienna,Vienna,"company, startup, one person",Software & Analytics,Avdain is a enterprise that embodies a fusion ...,2020,...,9,5,290,12,0.250000,0,0,"[innovation, academic, entrepreneurial]",B2B,
1,Popper Power GmbH,https://www.eu-startups.com/directory/popper-p...,https://www.eu-startups.com/wp-content/uploads...,www.popperpower.com,Vienna,Vienna,"ev, battery, bess, charging",Energy,Popper Power GmbH develops advanced energy sto...,2022,...,8,5,42,2,0.210000,0,0,"[energy storage, EV charging, infrastructure]",B2B,
2,Setter AI,https://www.eu-startups.com/directory/setter-ai/,https://www.eu-startups.com/wp-content/uploads...,https://www.trysetter.com,Wien,Vienna,"ai, ai agents, sales & marketing, ai saas, ai ...",Software & Analytics,Speed matters when you want more sales. That’s...,2024,...,9,7,305,15,0.200000,2,1,"[sales, AI, integration]",B2B,
3,SurveySensum,https://www.eu-startups.com/directory/surveyse...,https://www.eu-startups.com/wp-content/uploads...,https://www.surveysensum.com,vienna,Vienna,"customer feedback, customer experience",Software & Analytics,SurveySensum is a leading customer feedback pl...,2018,...,9,7,203,10,0.200000,0,0,"[customer feedback, CX, business goals]",B2B,
4,Artypa,https://www.eu-startups.com/directory/artypa/,https://www.eu-startups.com/wp-content/uploads...,https://artypa.com,Vienna,Vienna,"ai content, ai productivity, ai generation",Software & Analytics,Traditional AI workflows often involve navigat...,2024,...,8,4,335,15,0.200000,0,0,"[AI, platform, efficiency]",B2B,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,Ease-Link,https://www.eu-startups.com/directory/ease-link/,https://www.eu-startups.com/wp-content/uploads...,http://easelink.com/,Graz,Styria,"automotive, industrial automation",Mobility,"Ease-Link GmbH develops MATRIX CHARGING®, the ...",2015,...,8,4,22,1,0.136000,0,0,"[automated, conductive, charging]",B2B,
514,Healcloud,https://www.eu-startups.com/directory/healcloud/,https://www.eu-startups.com/wp-content/uploads...,http://www.healcloud.com/,Vienna,Vienna,"big data, health care",Software & Analytics,Healcloud is a European health tech enabler to...,2015,...,6,2,14,1,0.214286,0,0,"[health, tech, enabler]",B2B2C,4.0
515,Rendity,https://www.eu-startups.com/directory/rendity/,https://www.eu-startups.com/wp-content/uploads...,https://rendity.com/,Vienna,Vienna,"finance, financial services, real estate",FinTech/InsurTech,Rendity enables real estate investments for ev...,2015,...,8,4,8,1,0.000000,0,0,"[real estate, investments, everyone]",B2C,7.0
516,Ubiq.ai,https://www.eu-startups.com/directory/ubiq-ai/,https://www.eu-startups.com/wp-content/uploads...,https://www.ubiq.ai/,Vienna,Vienna,"autonomous vehicles, car sharing, internet, la...",Mobility,Ubiq is shaping the future of urban mobility b...,2015,...,8,4,22,1,0.180000,0,0,"[urban mobility, mobility services, sustainable]",B2B,7.0


In [None]:
df_listings_enriched.to_csv("./data/eustartup_listings_enriched_1.csv", index=False)

In [47]:
# 2. Build a smaller DataFrame with only the link, description, + those features
cols = ["link_startupeu", "business_description", "founded"] + feature_keys
df_subset = df_listings_enriched[cols]

# 3. Filter to rows where any of those feature columns is NaN
df_with_nans = df_subset[df_subset[feature_keys].isna().any(axis=1)]

In [48]:
# Count NaNs in each column of df_with_nans
nan_counts = df_with_nans.isna().sum()

print(nan_counts)

link_startupeu              0
business_description        0
founded                     0
writing_score               0
clarity_score               0
innovativeness_score      123
market_readiness_score      0
founder_signal_score        0
sentiment_score             0
traction_score              0
word_count                  0
sentence_count              0
jargon_density              0
numeric_evidence_count      0
call_to_action_flag         0
top_3_keywords              0
business_model              0
dtype: int64


In [53]:
df_with_nans

Unnamed: 0,link_startupeu,business_description,founded,writing_score,clarity_score,innovativeness_score,market_readiness_score,founder_signal_score,sentiment_score,traction_score,word_count,sentence_count,jargon_density,numeric_evidence_count,call_to_action_flag,top_3_keywords,business_model
10,https://www.eu-startups.com/directory/finslice/,Finslice is an exchange that enables users to ...,2024,8,9,,6,5,8,4,85,5,0.200000,0,0,"[fractional investment, tangible assets, ETF-l...",platform
13,https://www.eu-startups.com/directory/wasitai/,WasItAI is a service for determining if images...,2024,7,8,,6,5,7,4,30,2,0.100000,0,0,"[images, AI, origin]",B2B
15,https://www.eu-startups.com/directory/hypertxt/,Hypertxt is a cutting-edge AI-powered content ...,2023,9,9,,7,6,9,5,290,12,0.150000,1,1,"[content creation, SEO-optimized, blog posts]",B2B
21,https://www.eu-startups.com/directory/evolushost/,Evolushost is a leading provider of robust clo...,2020,8,9,,8,7,9,6,319,15,0.200000,0,1,"[cloud solutions, infrastructure, dedicated se...",B2B
23,https://www.eu-startups.com/directory/linkedin...,LinkedIn Company Insights can quickly show you...,2024,7,8,,5,4,7,3,42,3,0.140000,0,0,"[insights, competition, growth]",B2B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
484,https://www.eu-startups.com/directory/bsurance-2/,Support for companies in global scaling and ex...,2017,6,7,,5,5,6,4,24,1,0.250000,0,0,"[global scaling, digitization, insurance produ...",B2B
506,https://www.eu-startups.com/directory/webcp/,Webcp ist ein Unternehmen aus Oberösterreich d...,2015,7,8,,6,5,8,4,25,2,0.200000,0,0,"[Webdesign, Suchmaschinenoptimierung, Kundenzu...",B2B
514,https://www.eu-startups.com/directory/healcloud/,Healcloud is a European health tech enabler to...,2015,6,5,,3,4,6,2,14,1,0.214286,0,0,"[health, tech, enabler]",B2B2C
515,https://www.eu-startups.com/directory/rendity/,Rendity enables real estate investments for ev...,2015,8,9,,6,5,8,4,8,1,0.000000,0,0,"[real estate, investments, everyone]",B2C
