# 3.3 Enrich Dataset

In [3]:
import pandas as pd

In [9]:
# Notebook-wide pandas display settings:
#   - no cap on the number of rows / columns rendered (None = unlimited)
#   - a 1000-character wide "page" so wide frames are not wrapped
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [18]:
# Load the enriched listings CSV and preview the first five rows.
df_listings = pd.read_csv(filepath_or_buffer="./data/eustartup_listings_enriched_2.csv")

df_listings.head(n=5)

Unnamed: 0,name,link_startupeu,link_logo,website,city,region,tags,category,business_description,founded,age,total_funding,company_status,social_links,writing_score,clarity_score,innovativeness_score,market_readiness_score,founder_signal_score,sentiment_score,traction_score,word_count,sentence_count,jargon_density,numeric_evidence_count,call_to_action_flag,top_3_keywords,business_model,website_valid,website_clean,website_up,website_response_ms,redirected_url
0,Avdain,https://www.eu-startups.com/directory/avdain/,https://www.eu-startups.com/wp-content/uploads...,avdain.com,Vienna,Vienna,"company, startup, one person",Software & Analytics,Avdain is a enterprise that embodies a fusion ...,2020,5,No funding announced yet,Active,(),8,7,9.0,6,8,9,5,290,12,0.25,0,0,"['innovation', 'academic', 'entrepreneurial']",B2B,True,https://avdain.com,True,131.8,
1,Popper Power GmbH,https://www.eu-startups.com/directory/popper-p...,https://www.eu-startups.com/wp-content/uploads...,www.popperpower.com,Vienna,Vienna,"ev, battery, bess, charging",Energy,Popper Power GmbH develops advanced energy sto...,2022,3,Between €500K-€ 1 million,Active,"('https://www.linkedin.com/company/86313916',)",7,8,9.0,6,7,8,5,42,2,0.21,0,0,"['energy storage', 'EV charging', 'infrastruct...",B2B,True,https://www.popperpower.com,True,262.3,
2,Setter AI,https://www.eu-startups.com/directory/setter-ai/,https://www.eu-startups.com/wp-content/uploads...,https://www.trysetter.com,Wien,Vienna,"ai, ai agents, sales & marketing, ai saas, ai ...",Software & Analytics,Speed matters when you want more sales. That’s...,2024,1,Between €1-€100K,Active,(),8,9,7.0,8,6,9,7,305,15,0.2,2,1,"['sales', 'AI', 'integration']",B2B,True,https://www.trysetter.com,True,215.9,
3,SurveySensum,https://www.eu-startups.com/directory/surveyse...,https://www.eu-startups.com/wp-content/uploads...,https://www.surveysensum.com,vienna,Vienna,"customer feedback, customer experience",Software & Analytics,SurveySensum is a leading customer feedback pl...,2018,7,No funding announced yet,Active,('https://www.linkedin.com/company/SurveySensu...,8,9,7.0,8,6,9,7,203,10,0.2,0,0,"['customer feedback', 'CX', 'business goals']",B2B,True,https://www.surveysensum.com,True,839.6,
4,Artypa,https://www.eu-startups.com/directory/artypa/,https://www.eu-startups.com/wp-content/uploads...,https://artypa.com,Vienna,Vienna,"ai content, ai productivity, ai generation",Software & Analytics,Traditional AI workflows often involve navigat...,2024,1,No funding announced yet,Active,(),8,9,7.0,6,5,8,4,335,15,0.2,0,0,"['AI', 'platform', 'efficiency']",B2B,True,https://artypa.com,True,1084.7,


## Website Scraping

**Strip legal-form suffixes (GmbH, AG, KG, etc.) from company names**

In [41]:
import re

# Typical Austrian legal-form suffixes, including dotted variants.
# Order matters: regex alternation takes the FIRST alternative that matches,
# so the compound forms ("GmbH & Co KG", "AG & Co KG") must be listed before
# the short forms they start with. Otherwise only "GmbH" / "KG" are removed
# and "& Co" is left behind in the cleaned name.
austrian_legal_suffixes = [
    r'gmbh\s*&\s*co\.?\s*kg', r'ag\s*&\s*co\.?\s*kg',
    r'g\.?m\.?b\.?h\.?', r'ag', r'o\.?g\.?', r'k\.?g\.?', r'e\.?u\.?',
]

# Build the regex pattern.
# The right-hand guard is `(?!\w)` rather than `\b`: a `\b` can never match
# directly after a trailing "." that is followed by a space or end-of-string,
# which forced the optional dot to be given up and left names like "Foo ."
# for inputs such as "Foo e.U." or "Foo G.m.b.H.".
pattern = re.compile(
    r'\b(' + '|'.join(austrian_legal_suffixes) + r')(?!\w)',
    flags=re.IGNORECASE,
)

# Clean the column: drop the legal form, then trim surrounding whitespace.
df_listings['name_cleaned'] = df_listings['name'].str.replace(pattern, '', regex=True).str.strip()

In [44]:
# Step 1: exact-match overrides for cleaned names that still carry a URL,
# a slogan or another decoration after the suffix cleanup.
rename_map = {
    'https://www.branding5.com/': 'Branding 5',
    'FragDasPDF – Simply talk to any PDF': 'FragDasPDF',
    'heyqq - ask away': 'heyqq',
    'prop.ID – simply real estate': 'prop.ID',
    'devjobs.at IT-Recruiting': 'devjobs.at',
    'Uloo.me': 'Uloo',
    'DECIDEA – make good decisions': 'Decidea'
}
df_listings['name_cleaned'] = df_listings['name_cleaned'].replace(to_replace=rename_map)

# Step 2: discard the rows whose cleaned name is on the exclusion list.
to_remove = {
    'LinkedIn Company Insights by AroundDeal',
    '506',
    'illwerke vkw'
}
is_excluded = df_listings['name_cleaned'].isin(to_remove)
df_listings = df_listings.loc[~is_excluded]

### Start API requests

For each startup, look up:

- the first general search result for the name
- X (Twitter)
- LinkedIn
- Instagram

In [85]:
# Alias, not a copy: `df` and `df_listings` refer to the same DataFrame object.
df = df_listings

# Preview the first three rows.
df.head(3)

Unnamed: 0,name,link_startupeu,link_logo,website,city,region,tags,category,business_description,founded,age,total_funding,company_status,social_links,writing_score,clarity_score,innovativeness_score,market_readiness_score,founder_signal_score,sentiment_score,traction_score,word_count,sentence_count,jargon_density,numeric_evidence_count,call_to_action_flag,top_3_keywords,business_model,website_valid,website_clean,website_up,website_response_ms,redirected_url,name_cleaned
0,Avdain,https://www.eu-startups.com/directory/avdain/,https://www.eu-startups.com/wp-content/uploads...,avdain.com,Vienna,Vienna,"company, startup, one person",Software & Analytics,Avdain is a enterprise that embodies a fusion ...,2020,5,No funding announced yet,Active,(),8,7,9.0,6,8,9,5,290,12,0.25,0,0,"['innovation', 'academic', 'entrepreneurial']",B2B,True,https://avdain.com,True,131.8,,Avdain
1,Popper Power GmbH,https://www.eu-startups.com/directory/popper-p...,https://www.eu-startups.com/wp-content/uploads...,www.popperpower.com,Vienna,Vienna,"ev, battery, bess, charging",Energy,Popper Power GmbH develops advanced energy sto...,2022,3,Between €500K-€ 1 million,Active,"('https://www.linkedin.com/company/86313916',)",7,8,9.0,6,7,8,5,42,2,0.21,0,0,"['energy storage', 'EV charging', 'infrastruct...",B2B,True,https://www.popperpower.com,True,262.3,,Popper Power
2,Setter AI,https://www.eu-startups.com/directory/setter-ai/,https://www.eu-startups.com/wp-content/uploads...,https://www.trysetter.com,Wien,Vienna,"ai, ai agents, sales & marketing, ai saas, ai ...",Software & Analytics,Speed matters when you want more sales. That’s...,2024,1,Between €1-€100K,Active,(),8,9,7.0,8,6,9,7,305,15,0.2,2,1,"['sales', 'AI', 'integration']",B2B,True,https://www.trysetter.com,True,215.9,,Setter AI


In [89]:
import os
import re
import requests
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv

# Pull the variables of the local .env file into os.environ.
load_dotenv()

# Serper search API settings. API_KEY is None when SERPER_DEV_KEY is not set.
API_KEY = os.environ.get("SERPER_DEV_KEY")
ENDPOINT = "https://google.serper.dev/search"
HEADERS = {"X-API-KEY": API_KEY}

# Result-column label -> domain used in the `site:` restriction of the query.
DOMAINS = {
    "Instagram": "instagram.com",
    "X": "x.com",
    "LinkedIn": "linkedin.com/company",
}

def make_safe(name):
    """Return `name` reduced to word characters separated by single spaces.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace becomes a space; runs of whitespace are then
    collapsed to one space and the ends are trimmed.
    """
    without_punctuation = re.sub(r"[^\w\s]", " ", name)
    single_spaced = re.sub(r"\s+", " ", without_punctuation)
    return single_spaced.strip()

# Seconds to wait for a single search response. Without a timeout, one stalled
# connection would block the whole (roughly half-hour) run indefinitely.
REQUEST_TIMEOUT_S = 30


def _first_organic_link(session, query):
    """Run one search for `query` and return the link of the first organic hit.

    Parameters
    ----------
    session : requests.Session
        Session used to send the request (reuses the HTTP connection).
    query : str
        Search string sent as the `q` parameter.

    Returns
    -------
    str or None
        The "link" of the first entry in the response's "organic" list,
        or None when that list is missing or empty.

    Raises
    ------
    requests.HTTPError
        On a non-success HTTP status (via raise_for_status).
    requests.Timeout
        When no response arrives within REQUEST_TIMEOUT_S seconds.
    """
    params = {
        "q":      query,
        "gl":     "AT",
        "hl":     "de",
        "source": "web"
    }
    resp = session.get(ENDPOINT, headers=HEADERS, params=params, timeout=REQUEST_TIMEOUT_S)
    resp.raise_for_status()
    organic = resp.json().get("organic", [])
    return organic[0].get("link") if organic else None


results = []
# One Session for the whole run so the connection to the API is reused
# instead of being re-established for each of the four requests per startup.
with requests.Session() as session:
    for name in tqdm(df['name_cleaned'].dropna().unique(), desc="Startups"):
        row = {"Startup": name}
        safe_name = make_safe(name)

        # 1) Domain-restricted lookups (no fallback to the general search):
        #    first hit of `site:<domain> "<name>"`, otherwise None.
        for label, domain in tqdm(DOMAINS.items(), desc=f"Domains for {name}", leave=False):
            row[label] = _first_organic_link(session, f'site:{domain} "{safe_name}"')

        # 2) One unrestricted "General" lookup for the startup name.
        row["General"] = _first_organic_link(session, safe_name)

        results.append(row)

Startups:   0%|          | 0/515 [00:00<?, ?it/s]

Startups: 100%|██████████| 515/515 [34:46<00:00,  4.05s/it]


In [90]:
# One row per looked-up startup; columns come from the keys of the result dicts.
out_df = pd.DataFrame(data=results)

out_df.head(n=3)

Unnamed: 0,Startup,Instagram,X,LinkedIn,General
0,Avdain,https://www.instagram.com/ns.rabbani/p/BSEJIDS...,https://x.com/awesomeindiehub/status/187271911...,,https://www.avdain.com/our-mission
1,Popper Power,https://www.instagram.com/popperpower/,https://x.com/FranzHoegl/status/18789481448322...,https://at.linkedin.com/company/popperpower,https://www.popperpower.com/?srsltid=AfmBOoqrk...
2,Setter AI,https://www.instagram.com/trysetter/,https://x.com/trysetter,https://www.linkedin.com/company/setterai?trk=...,https://www.trysetter.com/


In [91]:
# Left join: keep every listing row and attach its lookup results by cleaned name.
merged_df = pd.merge(
    df,
    out_df,
    how="left",
    left_on="name_cleaned",
    right_on="Startup",
)
merged_df.head(n=3)

Unnamed: 0,name,link_startupeu,link_logo,website,city,region,tags,category,business_description,founded,age,total_funding,company_status,social_links,writing_score,clarity_score,innovativeness_score,market_readiness_score,founder_signal_score,sentiment_score,traction_score,word_count,sentence_count,jargon_density,numeric_evidence_count,call_to_action_flag,top_3_keywords,business_model,website_valid,website_clean,website_up,website_response_ms,redirected_url,name_cleaned,Startup,Instagram,X,LinkedIn,General
0,Avdain,https://www.eu-startups.com/directory/avdain/,https://www.eu-startups.com/wp-content/uploads...,avdain.com,Vienna,Vienna,"company, startup, one person",Software & Analytics,Avdain is a enterprise that embodies a fusion ...,2020,5,No funding announced yet,Active,(),8,7,9.0,6,8,9,5,290,12,0.25,0,0,"['innovation', 'academic', 'entrepreneurial']",B2B,True,https://avdain.com,True,131.8,,Avdain,Avdain,https://www.instagram.com/ns.rabbani/p/BSEJIDS...,https://x.com/awesomeindiehub/status/187271911...,,https://www.avdain.com/our-mission
1,Popper Power GmbH,https://www.eu-startups.com/directory/popper-p...,https://www.eu-startups.com/wp-content/uploads...,www.popperpower.com,Vienna,Vienna,"ev, battery, bess, charging",Energy,Popper Power GmbH develops advanced energy sto...,2022,3,Between €500K-€ 1 million,Active,"('https://www.linkedin.com/company/86313916',)",7,8,9.0,6,7,8,5,42,2,0.21,0,0,"['energy storage', 'EV charging', 'infrastruct...",B2B,True,https://www.popperpower.com,True,262.3,,Popper Power,Popper Power,https://www.instagram.com/popperpower/,https://x.com/FranzHoegl/status/18789481448322...,https://at.linkedin.com/company/popperpower,https://www.popperpower.com/?srsltid=AfmBOoqrk...
2,Setter AI,https://www.eu-startups.com/directory/setter-ai/,https://www.eu-startups.com/wp-content/uploads...,https://www.trysetter.com,Wien,Vienna,"ai, ai agents, sales & marketing, ai saas, ai ...",Software & Analytics,Speed matters when you want more sales. That’s...,2024,1,Between €1-€100K,Active,(),8,9,7.0,8,6,9,7,305,15,0.2,2,1,"['sales', 'AI', 'integration']",B2B,True,https://www.trysetter.com,True,215.9,,Setter AI,Setter AI,https://www.instagram.com/trysetter/,https://x.com/trysetter,https://www.linkedin.com/company/setterai?trk=...,https://www.trysetter.com/


In [93]:
# Checkpoint the merged frame (row index not written).
# NOTE(review): this path has no `.csv` extension, while a later cell writes
# `eustartup_listings_enriched_3.csv` -- confirm which file is the canonical one.
merged_df.to_csv(path_or_buf="./data/eustartup_listings_enriched_3", index=False)

In [4]:
# Reload the merged frame from the extension-less checkpoint file.
merged_df = pd.read_csv(filepath_or_buffer="./data/eustartup_listings_enriched_3")

#### Process the search results: compare each listed website with the top general search hit

In [15]:
import pandas as pd
from urllib.parse import urlparse

# 1) Normalize to hostname only
def get_host(u):
    """
    Parse a URL and return only its lower-cased hostname
    (no scheme, no port, no credentials, no leading "www.", no path).

    e.g. "https://www.avdain.com/our-mission" -> "avdain.com"
         "www.popperpower.com"                -> "popperpower.com"

    Returns None for missing values (None / NaN) and "" when no hostname
    can be extracted.
    """
    if pd.isnull(u):
        return None
    text = str(u).strip()
    # urlparse only fills in the network location when the input has a scheme
    # or starts with "//"; a bare "example.com/path" would otherwise end up
    # entirely in `.path` and produce an empty host.
    if "://" not in text and not text.startswith("//"):
        text = "//" + text
    # `.hostname` is already lower-cased and excludes port and userinfo.
    host = urlparse(text).hostname or ""
    # strip www.
    if host.startswith("www."):
        host = host[4:]
    return host

# 2) Derive one host column per URL column (target column <- source column)
for host_column, url_column in (
    ('website_host', 'website_clean'),
    ('general_host', 'General'),
):
    merged_df[host_column] = merged_df[url_column].apply(get_host)

In [30]:
# 3) Keep the rows that have a General search hit whose host differs from the
#    host of the listed website.
mismatch_df = merged_df[
    merged_df['General'].notnull() &
    (merged_df['website_host'] != merged_df['general_host'])
].copy()

# 4) Keep only the columns needed to review the mismatches.
#    The two host columns are the ones compared in the filter above. The
#    previously selected 'website_root' / 'general_root' columns are not
#    created by any cell of this notebook and raised a KeyError on a fresh run.
mismatch_df = mismatch_df[[
    'name_cleaned',
    "top_3_keywords",
    "website_up",
    'website_host',
    'general_host'
]]

In [None]:
# Hand-maintained corrections: old (wrong) root URL -> new root URL
root_mapping = {
    'https://moneycare.at':             'https://moneycare.io',
    'https://logicdev.eu':              'https://logiicdev.eu',
    'https://apichamp.com':             'https://apichap.com',
    'https://goddard-discovery.ml':     'https://goddard-discovery.com',
    'https://yourstyle.app':            'https://your-style.fashion',
    'https://heartbeat-bio.eu':         'https://heartbeat.bio',
    'https://cellectric-biosciences.com':'https://cellectric.com',
    'https://rotable.at':               'https://rotable.de'
}


def _to_root(url):
    """Reduce `url` to 'https://<host>' using get_host.

    Values for which get_host yields no host (missing or unparseable) are
    returned unchanged so that nothing is lost.
    NOTE(review): assumes roots always use https://, as every entry of
    root_mapping does -- confirm for http-only sites.
    """
    host = get_host(url)
    return f"https://{host}" if host else url


# This cell used to require a `website_root` column that no cell of this
# notebook creates, and it dropped that column at the end, so it failed on a
# fresh kernel and on every re-run. Use the column when it is present,
# otherwise derive the roots from `website_clean`.
if 'website_root' in merged_df.columns:
    website_root = merged_df['website_root']
else:
    website_root = merged_df['website_clean'].map(_to_root)

# Apply the corrections and promote the result to website_clean
merged_df['website_clean'] = website_root.replace(root_mapping)

# Drop the helper column if it exists (errors='ignore' keeps the cell re-runnable)
merged_df = merged_df.drop(columns=['website_root'], errors='ignore')

# Mark every row that now points at one of the corrected roots as reachable.
# NOTE(review): this sets website_up without re-checking the site.
new_roots = set(root_mapping.values())
mask_fixed = merged_df['website_clean'].isin(new_roots)
merged_df.loc[mask_fixed, 'website_up'] = True

In [33]:
# One report entry per (mapping entry, row whose website_clean equals the
# mapping's new root), in mapping order.
change_records = [
    {
        "Startup":     hit['name'],
        "Old Website": previous_root,
        "New Website": corrected_root,
        "website_up":  hit['website_up']
    }
    for previous_root, corrected_root in root_mapping.items()
    for _, hit in merged_df[merged_df['website_clean'] == corrected_root].iterrows()
]

changed_df = pd.DataFrame(change_records)

# Plain-text rendering of the report, without the index column
print(changed_df.to_string(index=False))

               Startup                        Old Website                   New Website  website_up
            money:care               https://moneycare.at          https://moneycare.io        True
              logicdev                https://logicdev.eu          https://logiicdev.eu        True
              APICHAMP               https://apichamp.com           https://apichap.com        True
   Goddard – Discovery       https://goddard-discovery.ml https://goddard-discovery.com        True
             yourStyle              https://yourstyle.app    https://your-style.fashion        True
         HeartBeat.bio           https://heartbeat-bio.eu         https://heartbeat.bio        True
CellEctric Biosciences https://cellectric-biosciences.com        https://cellectric.com        True
  rotable technologies                 https://rotable.at            https://rotable.de        True


In [37]:
# Persist the corrected frame (row index not written).
merged_df.to_csv(path_or_buf="./data/eustartup_listings_enriched_3.csv", index=False)

In [38]:
# Preview only: with display.max_rows set to None, a bare `merged_df` renders
# every row of the frame into the notebook output.
merged_df.head(10)

Unnamed: 0,name,link_startupeu,link_logo,website,city,region,tags,category,business_description,founded,age,total_funding,company_status,social_links,writing_score,clarity_score,innovativeness_score,market_readiness_score,founder_signal_score,sentiment_score,traction_score,word_count,sentence_count,jargon_density,numeric_evidence_count,call_to_action_flag,top_3_keywords,business_model,website_valid,website_clean,website_up,website_response_ms,redirected_url,name_cleaned,Startup,Instagram,X,LinkedIn,General,general_root,website_host,general_host
0,Avdain,https://www.eu-startups.com/directory/avdain/,https://www.eu-startups.com/wp-content/uploads...,avdain.com,Vienna,Vienna,"company, startup, one person",Software & Analytics,Avdain is a enterprise that embodies a fusion ...,2020,5,No funding announced yet,Active,(),8,7,9.0,6,8,9,5,290,12,0.25,0,0,"['innovation', 'academic', 'entrepreneurial']",B2B,True,https://avdain.com,True,131.8,,Avdain,Avdain,https://www.instagram.com/ns.rabbani/p/BSEJIDS...,https://x.com/awesomeindiehub/status/187271911...,,https://www.avdain.com/our-mission,https://avdain.com,avdain.com,avdain.com
1,Popper Power GmbH,https://www.eu-startups.com/directory/popper-p...,https://www.eu-startups.com/wp-content/uploads...,www.popperpower.com,Vienna,Vienna,"ev, battery, bess, charging",Energy,Popper Power GmbH develops advanced energy sto...,2022,3,Between €500K-€ 1 million,Active,"('https://www.linkedin.com/company/86313916',)",7,8,9.0,6,7,8,5,42,2,0.21,0,0,"['energy storage', 'EV charging', 'infrastruct...",B2B,True,https://popperpower.com,True,262.3,,Popper Power,Popper Power,https://www.instagram.com/popperpower/,https://x.com/FranzHoegl/status/18789481448322...,https://at.linkedin.com/company/popperpower,https://www.popperpower.com/?srsltid=AfmBOoqrk...,https://popperpower.com,popperpower.com,popperpower.com
2,Setter AI,https://www.eu-startups.com/directory/setter-ai/,https://www.eu-startups.com/wp-content/uploads...,https://www.trysetter.com,Wien,Vienna,"ai, ai agents, sales & marketing, ai saas, ai ...",Software & Analytics,Speed matters when you want more sales. That’s...,2024,1,Between €1-€100K,Active,(),8,9,7.0,8,6,9,7,305,15,0.2,2,1,"['sales', 'AI', 'integration']",B2B,True,https://trysetter.com,True,215.9,,Setter AI,Setter AI,https://www.instagram.com/trysetter/,https://x.com/trysetter,https://www.linkedin.com/company/setterai?trk=...,https://www.trysetter.com/,https://trysetter.com,trysetter.com,trysetter.com
3,SurveySensum,https://www.eu-startups.com/directory/surveyse...,https://www.eu-startups.com/wp-content/uploads...,https://www.surveysensum.com,vienna,Vienna,"customer feedback, customer experience",Software & Analytics,SurveySensum is a leading customer feedback pl...,2018,7,No funding announced yet,Active,('https://www.linkedin.com/company/SurveySensu...,8,9,7.0,8,6,9,7,203,10,0.2,0,0,"['customer feedback', 'CX', 'business goals']",B2B,True,https://surveysensum.com,True,839.6,,SurveySensum,SurveySensum,https://www.instagram.com/surveysensum/,https://x.com/surveysensum,https://www.linkedin.com/company/surveysensum,https://www.surveysensum.com/,https://surveysensum.com,surveysensum.com,surveysensum.com
4,Artypa,https://www.eu-startups.com/directory/artypa/,https://www.eu-startups.com/wp-content/uploads...,https://artypa.com,Vienna,Vienna,"ai content, ai productivity, ai generation",Software & Analytics,Traditional AI workflows often involve navigat...,2024,1,No funding announced yet,Active,(),8,9,7.0,6,5,8,4,335,15,0.2,0,0,"['AI', 'platform', 'efficiency']",B2B,True,https://artypa.com,True,1084.7,,Artypa,Artypa,https://www.instagram.com/artypa/,https://x.com/MicroLaunchHQ/status/18729305332...,,https://artypa.com/,https://artypa.com,artypa.com,artypa.com
5,Podpally,https://www.eu-startups.com/directory/podpally/,https://www.eu-startups.com/wp-content/uploads...,https://www.podpally.com/,Tyrol,Tyrol,"podcast preparation, podcast research, topic r...",Software & Analytics,PodPally – Your Podcast’s New Best Friend Say ...,2024,1,No funding announced yet,Active,(),8,9,7.0,6,5,9,4,139,7,0.14,0,1,"['podcast', 'research', 'prep']",B2B,True,https://podpally.com,True,139.8,,Podpally,Podpally,,https://x.com/PodPallyAI/status/18858095501350...,,https://www.podpally.com/,https://podpally.com,podpally.com,podpally.com
6,Wedding Capsule,https://www.eu-startups.com/directory/wedding-...,https://www.eu-startups.com/wp-content/uploads...,https://weddingcapsule.net/,Lochau,Vorarlberg,"wedding & event planning, lifestyle, productiv...",Software & Analytics,Wedding Capsule is a modern digital guestbook ...,2024,1,No funding announced yet,Active,(),8,9,8.0,7,6,9,5,392,20,0.1,0,1,"['guestbook', 'memories', 'wedding']",B2C,True,https://weddingcapsule.net,True,109.4,,Wedding Capsule,Wedding Capsule,https://www.instagram.com/viktoriaundviktor_we...,https://x.com/Wedding_Capsule,,https://weddingcapsule.net/?srsltid=AfmBOooHDf...,https://weddingcapsule.net,weddingcapsule.net,weddingcapsule.net
7,Hyrox Training Plans,https://www.eu-startups.com/directory/hyrox-tr...,https://www.eu-startups.com/wp-content/uploads...,https://www.hyroxtrainingplans.com/,Ober-wolfsbach,Lower Austria,"fitness, health & wellness, hyrox training",Health,HyroxTrainingPlans.com is a comprehensive dire...,2024,1,No funding announced yet,Active,(),8,9,7.0,6,5,8,4,104,6,0.15,0,0,"['training plans', 'Hyrox', 'athletes']",platform,True,https://hyroxtrainingplans.com,True,596.3,,Hyrox Training Plans,Hyrox Training Plans,https://www.instagram.com/hyroxtrainingplans/,,https://uk.linkedin.com/company/mycoachapp,https://www.puregym.com/blog/hyrox-training-plan/,https://puregym.com,hyroxtrainingplans.com,puregym.com
8,506.ai,https://www.eu-startups.com/directory/506ai/,https://www.eu-startups.com/wp-content/uploads...,"506,ai",Linz,Upper Austria,"ai, process, automation",Software & Analytics,506 CompanyGPT is the next-generation AI proce...,2020,5,Between €1 million-€ 2.5 million,Active,"('https://www.linkedin.com/in/gerhardkuerner/',)",8,9,7.0,6,5,8,4,116,6,0.25,2,0,"['AI', 'automation', 'enterprise']",B2B,True,https://506.ai,True,1204.9,,506.ai,506.ai,https://www.instagram.com/506ai_/,https://x.com/506ai_?lang=vi,https://at.linkedin.com/company/506-ai,https://www.506.ai/,https://506.ai,506.ai,506.ai
9,Get Worksheet,https://www.eu-startups.com/directory/get-work...,https://www.eu-startups.com/wp-content/uploads...,https://getworksheet.co,Vienna,Vienna,"education, artificial intelligence",Education,GetWorksheet.co is a simple tool for teachers ...,2023,2,Between €1-€100K,Active,(),8,9,6.0,7,5,8,4,85,6,0.1,0,0,"['teachers', 'worksheets', 'customize']",B2B,True,https://getworksheet.co,True,142.6,,Get Worksheet,Get Worksheet,https://www.instagram.com/branchtobloom/reel/C...,https://x.com/vidyacademy?lang=sr,,https://getworksheet.co/index.html,https://getworksheet.co,getworksheet.co,getworksheet.co
