# 3.2 Enrich dataset

In [1]:
import pandas as pd

In [6]:
# show *all* rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# widen the “page” so it won’t wrap columns  
pd.set_option('display.width', 1000)

In [15]:
df_listings = pd.read_csv("./data/eustartup_listings_enriched_1.csv")

df_listings = df_listings.drop(columns=['innovativness_score'])

df_listings.head(5)

Unnamed: 0,name,link_startupeu,link_logo,website,city,region,tags,category,business_description,founded,age,total_funding,company_status,social_links,writing_score,clarity_score,innovativeness_score,market_readiness_score,founder_signal_score,sentiment_score,traction_score,word_count,sentence_count,jargon_density,numeric_evidence_count,call_to_action_flag,top_3_keywords,business_model
0,Avdain,https://www.eu-startups.com/directory/avdain/,https://www.eu-startups.com/wp-content/uploads...,avdain.com,Vienna,Vienna,"company, startup, one person",Software & Analytics,Avdain is a enterprise that embodies a fusion ...,2020,5,No funding announced yet,Active,(),8,7,9.0,6,8,9,5,290,12,0.25,0,0,"['innovation', 'academic', 'entrepreneurial']",B2B
1,Popper Power GmbH,https://www.eu-startups.com/directory/popper-p...,https://www.eu-startups.com/wp-content/uploads...,www.popperpower.com,Vienna,Vienna,"ev, battery, bess, charging",Energy,Popper Power GmbH develops advanced energy sto...,2022,3,Between €500K-€ 1 million,Active,"('https://www.linkedin.com/company/86313916',)",7,8,9.0,6,7,8,5,42,2,0.21,0,0,"['energy storage', 'EV charging', 'infrastruct...",B2B
2,Setter AI,https://www.eu-startups.com/directory/setter-ai/,https://www.eu-startups.com/wp-content/uploads...,https://www.trysetter.com,Wien,Vienna,"ai, ai agents, sales & marketing, ai saas, ai ...",Software & Analytics,Speed matters when you want more sales. That’s...,2024,1,Between €1-€100K,Active,(),8,9,7.0,8,6,9,7,305,15,0.2,2,1,"['sales', 'AI', 'integration']",B2B
3,SurveySensum,https://www.eu-startups.com/directory/surveyse...,https://www.eu-startups.com/wp-content/uploads...,https://www.surveysensum.com,vienna,Vienna,"customer feedback, customer experience",Software & Analytics,SurveySensum is a leading customer feedback pl...,2018,7,No funding announced yet,Active,('https://www.linkedin.com/company/SurveySensu...,8,9,7.0,8,6,9,7,203,10,0.2,0,0,"['customer feedback', 'CX', 'business goals']",B2B
4,Artypa,https://www.eu-startups.com/directory/artypa/,https://www.eu-startups.com/wp-content/uploads...,https://artypa.com,Vienna,Vienna,"ai content, ai productivity, ai generation",Software & Analytics,Traditional AI workflows often involve navigat...,2024,1,No funding announced yet,Active,(),8,9,7.0,6,5,8,4,335,15,0.2,0,0,"['AI', 'platform', 'efficiency']",B2B


In [16]:
columns = df_listings.columns.to_list()

for column in columns:
    print(column)

name
link_startupeu
link_logo
website
city
region
tags
category
business_description
founded
age
total_funding
company_status
social_links
writing_score
clarity_score
innovativeness_score
market_readiness_score
founder_signal_score
sentiment_score
traction_score
word_count
sentence_count
jargon_density
numeric_evidence_count
call_to_action_flag
top_3_keywords
business_model


In [17]:
df_with_nans = df_listings[df_listings.isna().any(axis=1)]

# Count NaNs in each column of df_with_nans
nan_counts = df_with_nans.isna().sum()

print(nan_counts)


name                        0
link_startupeu              0
link_logo                   0
website                     0
city                        0
region                      0
tags                        0
category                    0
business_description        0
founded                     0
age                         0
total_funding               0
company_status            274
social_links                0
writing_score               0
clarity_score               0
innovativeness_score      123
market_readiness_score      0
founder_signal_score        0
sentiment_score             0
traction_score              0
word_count                  0
sentence_count              0
jargon_density              0
numeric_evidence_count      0
call_to_action_flag         0
top_3_keywords              0
business_model              0
dtype: int64


## Enrich dataset with Website Data

### Check website data

In [23]:
import pandas as pd
from urllib.parse import urlparse
import re

# 1. A helper to test validity
def is_valid_url(url: str) -> bool:
    try:
        p = urlparse(url)
        return p.scheme in ("http", "https") and bool(p.netloc)
    except:
        return False

# Identify the invalid URLs in their original form
df_invalid_before = df_listings[~df_listings["website"].apply(is_valid_url)]

# Show a few examples
print(df_invalid_before[["name", "website"]].head(10))

                   name                website
0                Avdain             avdain.com
1     Popper Power GmbH    www.popperpower.com
8                506.ai                 506,ai
11           DAIKI GmbH             www.dai.ki
18             IDCanopy       www.idcanopy.com
20       Permar AI Inc.          www.permar.ai
74         PassionClass    www.passionclass.co
78   Feel Forever Young  www.fancybynature.com
86                Godot      godot-austria.com
104      myCulture GmbH      www.myculture.app


In [24]:
# A cleaning function to:
# replace commas with dots (“506,ai”→“506.ai”)
# strip whitespace
# add “https://” if no scheme present
# remove trailing slashes
def clean_url(u: str) -> str:
    if pd.isna(u) or not isinstance(u, str):
        return u
    u = u.strip()
    # swap commas for dots
    u = u.replace(",", ".")
    # add scheme if missing
    if not re.match(r"^https?://", u, flags=re.IGNORECASE):
        u = "https://" + u
    # drop trailing slash
    u = u.rstrip("/")
    return u


# Apply to DataFrame
df_listings["website_clean"] = df_listings["website"].apply(clean_url)

# Re-validate
df_listings["website_valid"] = df_listings["website_clean"].apply(is_valid_url)

# Inspect the still-invalid ones
df_still_bad = df_listings[~df_listings["website_valid"]]
print(f"Still invalid URLs: {len(df_still_bad)}")
print(df_still_bad[["name","website","website_clean"]])

Still invalid URLs: 0
Empty DataFrame
Columns: [name, website, website_clean]
Index: []


### Check Website Availability

In [27]:
import time
import requests
import pandas as pd
from tqdm.auto import tqdm

# using the cleaned URLs
urls = df_listings["website_clean"]

# Prepare result lists
website_up = []
website_response_ms = []

# Loop with a progress bar
for url in tqdm(urls, desc="Checking website availability"):
    try:
        start = time.time()
        resp = requests.head(url, timeout=5, allow_redirects=True)
        elapsed = (time.time() - start) * 100  # ms
        website_up.append(resp.status_code == 200)
        website_response_ms.append(round(elapsed, 1))
    except Exception:
        website_up.append(False)
        website_response_ms.append(None)

# Assign back to DataFrame
df_listings["website_up"] = website_up
df_listings["website_response_ms"] = website_response_ms

# Quick check
print(df_listings[["website_clean", "website_up", "website_response_ms"]].head())

  from .autonotebook import tqdm as notebook_tqdm
Checking website availability: 100%|██████████| 518/518 [06:09<00:00,  1.40it/s]

                  website_clean  website_up  website_response_ms
0            https://avdain.com        True                131.8
1   https://www.popperpower.com        True                262.3
2     https://www.trysetter.com        True                215.9
3  https://www.surveysensum.com        True                839.6
4            https://artypa.com        True               1084.7





In [29]:
# Filter to only the rows where website_up is False
df_down = df_listings[df_listings["website_up"] == False]

# Quick summary
print(f"Found {len(df_down)} listings whose site is down:")

# See which ones
print(df_down[["name", "website_clean", "website_up"]])

Found 112 listings whose site is down:
                               name                            website_clean  website_up
24                       jeanszilla                   https://jeanszilla.com       False
35                         logicdev                  https://www.logicdev.eu       False
58                             BeVi                       https://beviai.com       False
65                        Whalecard                     https://whalecard.co       False
67                         byeagain                  https://www.byeagain.at       False
80                         Timeular                     https://timeular.com       False
83                           HeyBob                       https://heybob.app       False
84                          Whistle               https://www.usewhistle.com       False
85                        alwritely                https://www.alwritely.com       False
90                   Destination360                https://destination3

**Manually fix some mal entries**

In [31]:
# List of startup “name” values for which we want to force website_up=True
names_to_fix = [
    "Point of New",
    "INNIO Group",
    "Pocket Cocktails",
    "Codeversity",
    "Wowflow",
    "CoLivi",
    "Coders.Bay",
    "VIPASO",
    "VARS",
    "devjobs.at IT-Recruiting",
    "Mursall Active Coating",
    "NOVEM Gold",
    "Green Cloud Nine",
    "DrainBot",
    "Seasy GmbH",
    "25superstars",
    "Noreja",
    "Surfvacationer",
    "consola.finance",
    "alwritely",
    "Timeular",
    "byeagain",
    "BeVi",
    "jeanszilla"
]

# Force website_up=True for those names
df_listings.loc[
    df_listings["name"].isin(names_to_fix),
    "website_up"
] = True

# Verify the change
fixed = df_listings[df_listings["name"].isin(names_to_fix)]
print(f"Forced up for {len(fixed)} entries:")
print(fixed[["name", "website_clean", "website_up"]])

Forced up for 24 entries:
                         name                            website_clean  website_up
24                 jeanszilla                   https://jeanszilla.com        True
58                       BeVi                       https://beviai.com        True
67                   byeagain                  https://www.byeagain.at        True
80                   Timeular                     https://timeular.com        True
85                  alwritely                https://www.alwritely.com        True
100           consola.finance                  https://consola.finance        True
137            Surfvacationer               https://surfvacationer.com        True
212                    Noreja                       https://noreja.com        True
219              25superstars                 https://25superstars.com        True
228                Seasy GmbH                     https://www.seasy.at        True
236                  DrainBot                  https://www.dr

**Check some redirects**

In [32]:
import requests
from tqdm.auto import tqdm

# 1) Prepare a column to hold the redirect target
df_listings["redirected_url"] = None

# 2) Filter to the “down” sites
mask_down = df_listings["website_up"] == False

# 3) Iterate and check with redirects
for idx in tqdm(df_listings[mask_down].index, desc="Re-checking down sites"):
    original = df_listings.at[idx, "website_clean"]
    try:
        resp = requests.get(original, timeout=5, allow_redirects=True)
        final = resp.url.rstrip("/")
        # If we got a 200, mark it as up
        df_listings.at[idx, "website_up"] = (resp.status_code == 200)
        # If final URL differs, record it
        if final.lower() != original.rstrip("/").lower():
            df_listings.at[idx, "redirected_url"] = final
    except Exception:
        # leave website_up False and redirected_url None
        pass

# 4) Inspect the changes
fixed = df_listings[df_listings["redirected_url"].notna()]
print(f"Found {len(fixed)} sites that redirected and are now marked up:\n")
print(fixed[["name","website_clean","website_up","redirected_url"]])

Re-checking down sites: 100%|██████████| 88/88 [01:41<00:00,  1.16s/it]

Found 6 sites that redirected and are now marked up:

                     name                 website_clean  website_up                redirected_url
147  Celeris Therapeutics     https://www.celeristx.com       False        https://adhesiontx.com
170             Klarsicht  https://www.klarsicht.online       False      https://klarsicht.online
317          Secondra.com          https://secondra.com       False      https://www.secondra.com
440               Progeny      https://www.progeny.tech       False  https://totmtechnologies.com
495                7LYTIX        https://www.7lytix.com        True     https://www.7lytix.com/de
513             Ease-Link           http://easelink.com       False          https://easelink.com





In [33]:
# 1) Identify rows matching “easelink.com”
mask = df_listings["website_clean"].str.contains("easelink.com", case=False, na=False)

# 2) Force them up and record the redirect
df_listings.loc[mask, "website_up"] = True
df_listings.loc[mask, "redirected_url"] = "https://easelink.com"

# 3) Verify
print(df_listings.loc[mask, ["name","website_clean","website_up","redirected_url"]])

          name        website_clean  website_up        redirected_url
513  Ease-Link  http://easelink.com        True  https://easelink.com


In [36]:
# Filter to rows where website_up is False
df_down = df_listings[df_listings["website_up"] == False]

# Print the relevant columns
print(df_down[["name", "city", "website_clean", "website_up"]])

                               name                    city                           website_clean  website_up
35                         logicdev                    Graz                 https://www.logicdev.eu       False
65                        Whalecard                  Vienna                    https://whalecard.co       False
83                           HeyBob                  Vienna                      https://heybob.app       False
84                          Whistle                   Viena              https://www.usewhistle.com       False
90                   Destination360                    Graz               https://destination360.io       False
103                    Add to Water                  Vienna                  https://addtowater.com       False
105                        And-Less                    Wien                     https://and-less.at       False
111                            nuvo                  Vienna                        https://nuvo.cam     

In [37]:
import requests
import pandas as pd
from tqdm.auto import tqdm

# assume df_listings is your main DataFrame and you already have website_clean + website_up
# 1) Prepare a column to hold eventual redirects
df_listings["redirected_url"] = None

# 2) Find all indices still marked down
down_idx = df_listings.index[~df_listings["website_up"]]

# 3) Loop and follow redirects
for i in tqdm(down_idx, desc="Re-checking failed sites"):
    orig = df_listings.at[i, "website_clean"]
    try:
        resp = requests.get(orig, timeout=5, allow_redirects=True)
        final_url = resp.url.rstrip("/")
        if resp.status_code == 200:
            # mark as up
            df_listings.at[i, "website_up"] = True
            # store the redirect only if different
            if final_url.lower() != orig.rstrip("/").lower():
                df_listings.at[i, "redirected_url"] = final_url
    except requests.RequestException:
        # still down
        pass

# 4) Inspect results
fixed = df_listings[df_listings["website_up"] & df_listings["redirected_url"].notna()]
print(f"Fixed {len(fixed)} entries via redirect:\n")
print(fixed[["name","website_clean","redirected_url"]])


Re-checking failed sites: 100%|██████████| 81/81 [01:36<00:00,  1.19s/it]

Fixed 0 entries via redirect:

Empty DataFrame
Columns: [name, website_clean, redirected_url]
Index: []





In [66]:
df_listings = pd.read_csv("./data/eustartup_listings_enriched_2.csv")

In [70]:
# 1) Re-create the individual boolean masks
urls_to_flip_false = [
    "https://www.liquidary.com",
    "http://www.quantego.com",
    "https://cnqrclub.com",
]
norm_flip = [u.rstrip("/").replace("http://", "https://").lower() for u in urls_to_flip_false]
mask_flip = df_listings["website_clean"].str.rstrip("/").str.lower().isin(norm_flip)

mask_heal_old = df_listings["website_clean"].str.contains("healcloud.com", case=False, na=False)
mask_heal_new = df_listings["website_clean"].str.contains("healcloud.tech", case=False, na=False)

# 2) Combine them into one boolean mask
mask_changed = mask_flip | mask_heal_old | mask_heal_new

# 3) Print the affected rows
print(df_listings.loc[mask_changed, ["name", "website_clean", "website_up"]])


              name              website_clean  website_up
399  Liquidary.com  https://www.liquidary.com       False
416       CNQRCLUB       https://cnqrclub.com       False
514      Healcloud     https://healcloud.tech        True


In [71]:
df_listings.to_csv("./data/eustartup_listings_enriched_2.csv", index=False)