## Machine Learning Model to Predict Startup Status

In [1]:
import pandas as pd

In [2]:
# show *all* rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# widen the “page” so it won’t wrap columns  
pd.set_option('display.width', 1000)

In [4]:
startups = pd.read_csv("/Users/michaelbosch/Desktop/DS Projects/ds_startups/webscraping/data/startup_df.csv")

startups.head(3)

Unnamed: 0,name,link_startupeu,link_logo,link_website,website_up,city,region,category,top_3_keywords,business_model,linkedin_global_profile,linkedin_handle,linkedin_valid,instagram_handle,instagram_valid,x_handle,x_valid,founded_year,age,total_funding_listing,headline_count,headlines,headline_financial_count,headline_funding,headline_acquisition_count,headline_bankruptcy_count,headline_non_financial_count,description_writing_score,description_market_readiness_score,description_founder_signal_score,description_word_count,description_jargon_density,description_numeric_evidence_count,status
0,Avdain,https://www.eu-startups.com/directory/avdain/,https://www.eu-startups.com/wp-content/uploads...,avdain.com,True,Vienna,Vienna,Software & Analytics,"['innovation', 'academic', 'entrepreneurial']",B2B,False,,False,,False,,False,2020,5,No funding announced yet,0,[],0,0,0,0,0,8,6,8,290,0.25,0,no_funding
1,SurveySensum,https://www.eu-startups.com/directory/surveyse...,https://www.eu-startups.com/wp-content/uploads...,surveysensum.com,True,vienna,Vienna,Software & Analytics,"['customer feedback', 'CX', 'business goals']",B2B,True,surveysensum,True,surveysensum,True,surveysensum,True,2018,7,No funding announced yet,0,[],0,0,0,0,0,8,8,6,203,0.2,0,no_funding
2,Artypa,https://www.eu-startups.com/directory/artypa/,https://www.eu-startups.com/wp-content/uploads...,artypa.com,True,Vienna,Vienna,Software & Analytics,"['AI', 'platform', 'efficiency']",B2B,False,,False,artypa,True,,False,2024,1,No funding announced yet,0,[],0,0,0,0,0,8,6,5,335,0.2,0,no_funding


In [5]:
for col in startups.columns:
    print(col)

name
link_startupeu
link_logo
link_website
website_up
city
region
category
top_3_keywords
business_model
linkedin_global_profile
linkedin_handle
linkedin_valid
instagram_handle
instagram_valid
x_handle
x_valid
founded_year
age
total_funding_listing
headline_count
headlines
headline_financial_count
headline_funding
headline_acquisition_count
headline_bankruptcy_count
headline_non_financial_count
description_writing_score
description_market_readiness_score
description_founder_signal_score
description_word_count
description_jargon_density
description_numeric_evidence_count
status


In [6]:
top10_data = {}

for col in startups.columns:
    # include NaNs, then take the top 10
    vc = startups[col] \
         .value_counts(dropna=False) \
         .head(10)

    # (optional) relabel the NaN index to "<Missing>"
    vc.index = vc.index.where(~vc.index.isna(), '<Missing>')

    top_values = vc.index.tolist()
    top_counts = vc.values.tolist()

    top10_data[col]       = pd.Series(top_values)
    top10_data[f"{col}_count"] = pd.Series(top_counts)

top10_df = pd.DataFrame(top10_data)

display(top10_df)

Unnamed: 0,name,name_count,link_startupeu,link_startupeu_count,link_logo,link_logo_count,link_website,link_website_count,website_up,website_up_count,city,city_count,region,region_count,category,category_count,top_3_keywords,top_3_keywords_count,business_model,business_model_count,linkedin_global_profile,linkedin_global_profile_count,linkedin_handle,linkedin_handle_count,linkedin_valid,linkedin_valid_count,instagram_handle,instagram_handle_count,instagram_valid,instagram_valid_count,x_handle,x_handle_count,x_valid,x_valid_count,founded_year,founded_year_count,age,age_count,total_funding_listing,total_funding_listing_count,headline_count,headline_count_count,headlines,headlines_count,headline_financial_count,headline_financial_count_count,headline_funding,headline_funding_count,headline_acquisition_count,headline_acquisition_count_count,headline_bankruptcy_count,headline_bankruptcy_count_count,headline_non_financial_count,headline_non_financial_count_count,description_writing_score,description_writing_score_count,description_market_readiness_score,description_market_readiness_score_count,description_founder_signal_score,description_founder_signal_score_count,description_word_count,description_word_count_count,description_jargon_density,description_jargon_density_count,description_numeric_evidence_count,description_numeric_evidence_count_count,status,status_count
0,Avdain,1,https://www.eu-startups.com/directory/avdain/,1,https://www.eu-startups.com/wp-content/uploads...,1,<Missing>,62,True,326.0,Vienna,231,Vienna,246.0,Software & Analytics,194,"['Machine Learning', 'retail', 'demand forecas...",2,B2B,245.0,False,374.0,<Missing>,117,True,271.0,<Missing>,156,True,232.0,<Missing>,243,False,242.0,2020,102,5,102,No funding announced yet,296.0,0,138,[],138,0,211,0,269,0.0,360.0,0.0,381.0,0.0,216.0,7.0,240.0,6.0,155.0,5.0,157.0,56,42,0.25,171,0.0,305.0,no_funding,215.0
1,World Data Lab,1,https://www.eu-startups.com/directory/world-da...,1,https://www.eu-startups.com/wp-content/uploads...,1,avdain.com,1,False,62.0,Linz,33,Upper Austria,52.0,Professional Services,45,"['innovation', 'academic', 'entrepreneurial']",1,B2C,100.0,True,14.0,sprad-io,1,False,117.0,trueliveofficial,1,False,156.0,somareality,1,True,146.0,2019,99,6,99,Between €1 million-€ 2.5 million,27.0,1,60,['LR Achleitner: Programmierschmiede Coders.Ba...,1,1,39,1000000,12,1.0,19.0,2.0,3.0,1.0,74.0,8.0,98.0,5.0,131.0,6.0,120.0,66,37,0.2,82,1.0,62.0,funding,111.0
2,Rebel Meat,1,https://www.eu-startups.com/directory/rebel-meat/,1,https://www.eu-startups.com/wp-content/uploads...,1,quantego.com,1,,,Graz,23,Styria,29.0,Health,42,"['cloud directory', 'identity access managemen...",1,platform,16.0,,,oumaigou,1,,,interactivepaper,1,,,quantegoai,1,,,2021,60,4,60,Between €100K-€500K,23.0,2,43,['Wiener Startup Sprad.io will Mitarbeiter zu ...,1,5,38,600000,8,2.0,5.0,1.0,3.0,2.0,40.0,6.0,47.0,7.0,53.0,7.0,61.0,104,24,0.15,27,2.0,13.0,inactive,62.0
3,Rosterize,1,https://www.eu-startups.com/directory/rosterize/,1,https://www.eu-startups.com/wp-content/uploads...,1,runple.com,1,,,Wien,7,Lower Austria,20.0,FinTech/InsurTech,23,"['AI-assisted', 'aircrew planning', 'online ai...",1,B2B2C,15.0,,,qapture,1,,,localrydes,1,,,novasign_gmbh,1,,,2022,31,3,31,Between €2.5 million-5 million,12.0,3,34,['KI zur Selbstanwendung: SkinScreener: So tre...,1,3,34,100000,7,5.0,2.0,5.0,1.0,3.0,31.0,5.0,3.0,4.0,24.0,4.0,28.0,83,19,0.1,24,3.0,5.0,,
4,Runple,1,https://www.eu-startups.com/directory/runple/,1,https://www.eu-startups.com/wp-content/uploads...,1,seamox.com,1,,,Innsbruck,7,Tyrol,17.0,Media & Entertainment,15,"['ERP', 'retail', 'e-commerce']",1,marketplace,10.0,,,rosterize,1,,,digando_com,1,,,flightlevelsac,1,,,2018,22,7,22,Between €500K-€ 1 million,11.0,5,23,['Wiener Startup Rebel Meat von St. Pöltner Ti...,1,2,31,500000,6,3.0,1.0,,,4.0,12.0,,,8.0,21.0,8.0,14.0,85,19,0.14,18,4.0,2.0,,
5,SeaMoX Information Technologies,1,https://www.eu-startups.com/directory/seamox-i...,1,https://www.eu-startups.com/wp-content/uploads...,1,skinscreener.com,1,,,Salzburg,5,Salzburg,8.0,Other,10,"['software', 'custom', 'IT-Services']",1,other,2.0,,,runple,1,,,zerolensphoto,1,,,dreamwaves_io,1,,,2017,21,8,21,Between €5 million-€10 million,9.0,4,21,['Entzifferungssoftware für alte Handschriften...,1,4,19,6000000,5,4.0,1.0,,,5.0,8.0,,,3.0,3.0,9.0,4.0,134,18,0.0,9,5.0,1.0,,
6,SkinScreener,1,https://www.eu-startups.com/directory/skinscre...,1,https://www.eu-startups.com/wp-content/uploads...,1,sprad.io,1,,,vienna,3,Vorarlberg,7.0,Hardware,10,"['skin cancer', 'risk assessment', 'AI']",1,,,,,seamox,1,,,celantur_com,1,,,chatcloud,1,,,2023,19,2,19,Between €1-€100K,7.0,7,18,['qapture: Linzer Startup digitalisiert IKEA u...,1,6,10,2000000,4,,,,,6.0,6.0,,,9.0,1.0,3.0,4.0,108,16,0.3,8,,,,
7,Sprad,1,https://www.eu-startups.com/directory/sprad/,1,https://www.eu-startups.com/wp-content/uploads...,1,sproof.io,1,,,Dornbirn,3,Carinthia,6.0,AgTech/FoodTech,9,"['employee referrals', 'recruiting software', ...",1,,,,,skinscreener,1,,,mymeetfox,1,,,investbrickwise,1,,,2024,15,1,15,Between €10 million-€25 million,2.0,6,18,"['NEXTPART Security Intelligence', 'Tereza Bin...",1,7,4,2500000,4,,,,,7.0,1.0,,,,,,,174,15,0.5,8,,,,
8,sproof,1,https://www.eu-startups.com/directory/sproof/,1,https://www.eu-startups.com/wp-content/uploads...,1,getstaymate.com,1,,,Klagenfurt,3,Burgenland,3.0,Mobility,9,"['document signing', 'digital signature', 'tru...",1,,,,,sproof,1,,,worlddatalab,1,,,bird_shades,1,,,2015,11,10,11,Above €25 million,1.0,8,18,"['MADiscover: 350.000 Euro für Startup, das mi...",1,8,1,2200000,4,,,,,,,,,,,,,42,14,0.4,7,,,,
9,Staymate,1,https://www.eu-startups.com/directory/staymate/,1,https://www.eu-startups.com/wp-content/uploads...,1,storeroom.at,1,,,Wiener Neustadt,2,,,Education,6,"['guest communication', 'digital hotel assista...",1,,,,,surveysensum,1,,,woodspacestudio,1,,,aheadbio,1,,,2016,8,9,8,,,9,11,['Wiener Startup Legitary bietet KI-gestützte ...,1,10,1,4500000,3,,,,,,,,,,,,,92,13,0.21,4,,,,


**Drop irrelevant columns**

In [8]:
keep_cols = [
    "name",
    "link_startupeu",
    "link_website",
    "city",
    "region",
    "founded_year",
    "category",
    "top_3_keywords",
    "business_model",
    "linkedin_handle",
    "total_funding_listing",
    "headline_count",
    "headline_non_financial_count",
    "headline_financial_count",
    "status"
]

# Subset your DataFrame (named `listing`) to only these columns
df_final = startups[keep_cols].copy()

In [9]:
top10_data = {}

for col in df_final.columns:
    # include NaNs, then take the top 10
    vc = df_final[col] \
         .value_counts(dropna=False) \
         .head(10)

    # (optional) relabel the NaN index to "<Missing>"
    vc.index = vc.index.where(~vc.index.isna(), '<Missing>')

    top_values = vc.index.tolist()
    top_counts = vc.values.tolist()

    top10_data[col]       = pd.Series(top_values)
    top10_data[f"{col}_count"] = pd.Series(top_counts)

top10_df = pd.DataFrame(top10_data)

display(top10_df)

Unnamed: 0,name,name_count,link_startupeu,link_startupeu_count,link_website,link_website_count,city,city_count,region,region_count,founded_year,founded_year_count,category,category_count,top_3_keywords,top_3_keywords_count,business_model,business_model_count,linkedin_handle,linkedin_handle_count,total_funding_listing,total_funding_listing_count,headline_count,headline_count_count,headline_non_financial_count,headline_non_financial_count_count,headline_financial_count,headline_financial_count_count,status,status_count
0,Avdain,1,https://www.eu-startups.com/directory/avdain/,1,<Missing>,62,Vienna,231,Vienna,246.0,2020,102,Software & Analytics,194,"['Machine Learning', 'retail', 'demand forecas...",2,B2B,245.0,<Missing>,117,No funding announced yet,296.0,0,138,0.0,216.0,0,211,no_funding,215.0
1,World Data Lab,1,https://www.eu-startups.com/directory/world-da...,1,avdain.com,1,Linz,33,Upper Austria,52.0,2019,99,Professional Services,45,"['innovation', 'academic', 'entrepreneurial']",1,B2C,100.0,sprad-io,1,Between €1 million-€ 2.5 million,27.0,1,60,1.0,74.0,1,39,funding,111.0
2,Rebel Meat,1,https://www.eu-startups.com/directory/rebel-meat/,1,quantego.com,1,Graz,23,Styria,29.0,2021,60,Health,42,"['cloud directory', 'identity access managemen...",1,platform,16.0,oumaigou,1,Between €100K-€500K,23.0,2,43,2.0,40.0,5,38,inactive,62.0
3,Rosterize,1,https://www.eu-startups.com/directory/rosterize/,1,runple.com,1,Wien,7,Lower Austria,20.0,2022,31,FinTech/InsurTech,23,"['AI-assisted', 'aircrew planning', 'online ai...",1,B2B2C,15.0,qapture,1,Between €2.5 million-5 million,12.0,3,34,3.0,31.0,3,34,,
4,Runple,1,https://www.eu-startups.com/directory/runple/,1,seamox.com,1,Innsbruck,7,Tyrol,17.0,2018,22,Media & Entertainment,15,"['ERP', 'retail', 'e-commerce']",1,marketplace,10.0,rosterize,1,Between €500K-€ 1 million,11.0,5,23,4.0,12.0,2,31,,
5,SeaMoX Information Technologies,1,https://www.eu-startups.com/directory/seamox-i...,1,skinscreener.com,1,Salzburg,5,Salzburg,8.0,2017,21,Other,10,"['software', 'custom', 'IT-Services']",1,other,2.0,runple,1,Between €5 million-€10 million,9.0,4,21,5.0,8.0,4,19,,
6,SkinScreener,1,https://www.eu-startups.com/directory/skinscre...,1,sprad.io,1,vienna,3,Vorarlberg,7.0,2023,19,Hardware,10,"['skin cancer', 'risk assessment', 'AI']",1,,,seamox,1,Between €1-€100K,7.0,7,18,6.0,6.0,6,10,,
7,Sprad,1,https://www.eu-startups.com/directory/sprad/,1,sproof.io,1,Dornbirn,3,Carinthia,6.0,2024,15,AgTech/FoodTech,9,"['employee referrals', 'recruiting software', ...",1,,,skinscreener,1,Between €10 million-€25 million,2.0,6,18,7.0,1.0,7,4,,
8,sproof,1,https://www.eu-startups.com/directory/sproof/,1,getstaymate.com,1,Klagenfurt,3,Burgenland,3.0,2015,11,Mobility,9,"['document signing', 'digital signature', 'tru...",1,,,sproof,1,Above €25 million,1.0,8,18,,,8,1,,
9,Staymate,1,https://www.eu-startups.com/directory/staymate/,1,storeroom.at,1,Wiener Neustadt,2,,,2016,8,Education,6,"['guest communication', 'digital hotel assista...",1,,,surveysensum,1,,,9,11,,,10,1,,


In [10]:
df_final = df_final[df_final["status"]!="inactive"]

In [11]:
top10_data = {}

for col in df_final.columns:
    # include NaNs, then take the top 10
    vc = df_final[col] \
         .value_counts(dropna=False) \
         .head(10)

    # (optional) relabel the NaN index to "<Missing>"
    vc.index = vc.index.where(~vc.index.isna(), '<Missing>')

    top_values = vc.index.tolist()
    top_counts = vc.values.tolist()

    top10_data[col]       = pd.Series(top_values)
    top10_data[f"{col}_count"] = pd.Series(top_counts)

top10_df = pd.DataFrame(top10_data)

display(top10_df)

Unnamed: 0,name,name_count,link_startupeu,link_startupeu_count,link_website,link_website_count,city,city_count,region,region_count,founded_year,founded_year_count,category,category_count,top_3_keywords,top_3_keywords_count,business_model,business_model_count,linkedin_handle,linkedin_handle_count,total_funding_listing,total_funding_listing_count,headline_count,headline_count_count,headline_non_financial_count,headline_non_financial_count_count,headline_financial_count,headline_financial_count_count,status,status_count
0,Avdain,1,https://www.eu-startups.com/directory/avdain/,1,avdain.com,1,Vienna,193,Vienna,202.0,2020,85,Software & Analytics,165,"['Machine Learning', 'retail', 'demand forecas...",2,B2B,217.0,<Missing>,77,No funding announced yet,252.0,0,113,0.0,179.0,0,168,no_funding,215.0
1,Wood Space,1,https://www.eu-startups.com/directory/woodspace/,1,woodspace.com,1,Linz,26,Upper Austria,45.0,2019,81,Health,38,"['innovation', 'academic', 'entrepreneurial']",1,B2C,76.0,lunixo-digital-signage,1,Between €1 million-€ 2.5 million,22.0,1,45,1.0,61.0,5,34,funding,111.0
2,Read-Coop,1,https://www.eu-startups.com/directory/read-coop/,1,readcoop.eu,1,Graz,19,Styria,24.0,2021,47,Professional Services,37,"['ERP', 'SME', 'business processes']",1,platform,14.0,suppliot,1,Between €100K-€500K,18.0,2,35,2.0,35.0,3,33,,
3,Runple,1,https://www.eu-startups.com/directory/runple/,1,runple.com,1,Innsbruck,6,Lower Austria,17.0,2022,26,FinTech/InsurTech,17,"['ERP', 'retail', 'e-commerce']",1,B2B2C,12.0,store-room-gmbh,1,Between €2.5 million-5 million,12.0,3,28,3.0,25.0,1,32,,
4,SeaMoX Information Technologies,1,https://www.eu-startups.com/directory/seamox-i...,1,seamox.com,1,Wien,4,Tyrol,16.0,2017,20,Media & Entertainment,12,"['software', 'custom', 'IT-Services']",1,marketplace,6.0,getstaymate,1,Between €5 million-€10 million,9.0,5,21,4.0,12.0,2,27,,
5,SkinScreener,1,https://www.eu-startups.com/directory/skinscre...,1,skinscreener.com,1,Salzburg,4,Salzburg,7.0,2018,18,Hardware,9,"['skin cancer', 'risk assessment', 'AI']",1,other,1.0,sproof,1,Between €500K-€ 1 million,6.0,4,20,5.0,8.0,4,16,,
6,Sprad,1,https://www.eu-startups.com/directory/sprad/,1,sprad.io,1,Klagenfurt,3,Vorarlberg,6.0,2023,17,Other,8,"['employee referrals', 'recruiting software', ...",1,,,sprad-io,1,Between €1-€100K,4.0,7,18,6.0,5.0,6,10,,
7,sproof,1,https://www.eu-startups.com/directory/sproof/,1,sproof.io,1,vienna,3,Carinthia,6.0,2024,14,Mobility,8,"['document signing', 'digital signature', 'tru...",1,,,skinscreener,1,Between €10 million-€25 million,2.0,6,16,7.0,1.0,7,4,,
8,Staymate,1,https://www.eu-startups.com/directory/staymate/,1,getstaymate.com,1,Klosterneuburg,2,Burgenland,3.0,2015,10,AgTech/FoodTech,6,"['guest communication', 'digital hotel assista...",1,,,seamox,1,Above €25 million,1.0,8,15,,,8,1,,
9,STORE ROOM,1,https://www.eu-startups.com/directory/store-room/,1,storeroom.at,1,Niederosterreich,2,,,2016,8,PropTech,5,"['storage', 'smart', 'digitization']",1,,,runple,1,,,9,11,,,10,1,,


In [16]:
import pandas as pd

# build a boolean mask for your conditions
mask = (
    df_final['linkedin_handle'].isna()                                 # linkedin_handle is NA
    & (df_final['headline_count'] == 0)                                # headline_count == 0
    & (df_final['headline_non_financial_count'] == 0)                  # headline_non_financial_count == 0
    & (df_final['headline_financial_count'] == 0)                            # headline_financial == 0
    & (df_final['status'] == 'no_funding')                             # status == 'no funding'
)

# Option A: count by summing True values
n = mask.sum()
print(f"Number of rows matching criteria: {n}")

# Option B: select and get shape[0]
n2 = df_final.loc[mask].shape[0]
print(f"(verification) Number of rows matching criteria: {n2}")


Number of rows matching criteria: 50
(verification) Number of rows matching criteria: 50


In [18]:
# define the mask as before
mask = (
    df_final['linkedin_handle'].isna()
    & (df_final['headline_count'] == 0)
    & (df_final['headline_non_financial_count'] == 0)
    & (df_final['headline_financial_count'] == 0)
    & (df_final['status'] == 'no_funding')
)

# keep rows where mask is False
df_final = df_final[~mask].copy()


In [19]:
top10_data = {}

for col in df_final.columns:
    # include NaNs, then take the top 10
    vc = df_final[col] \
         .value_counts(dropna=False) \
         .head(10)

    # (optional) relabel the NaN index to "<Missing>"
    vc.index = vc.index.where(~vc.index.isna(), '<Missing>')

    top_values = vc.index.tolist()
    top_counts = vc.values.tolist()

    top10_data[col]       = pd.Series(top_values)
    top10_data[f"{col}_count"] = pd.Series(top_counts)

top10_df = pd.DataFrame(top10_data)

display(top10_df)

Unnamed: 0,name,name_count,link_startupeu,link_startupeu_count,link_website,link_website_count,city,city_count,region,region_count,founded_year,founded_year_count,category,category_count,top_3_keywords,top_3_keywords_count,business_model,business_model_count,linkedin_handle,linkedin_handle_count,total_funding_listing,total_funding_listing_count,headline_count,headline_count_count,headline_non_financial_count,headline_non_financial_count_count,headline_financial_count,headline_financial_count_count,status,status_count
0,SurveySensum,1,https://www.eu-startups.com/directory/surveyse...,1,surveysensum.com,1,Vienna,163,Vienna,172.0,2019,74,Software & Analytics,135,"['Machine Learning', 'retail', 'demand forecas...",2,B2B,182.0,<Missing>,27,No funding announced yet,202.0,0,63,0.0,129.0,0,118,no_funding,165.0
1,Qapture,1,https://www.eu-startups.com/directory/qapture/,1,qapture.at,1,Linz,22,Upper Austria,35.0,2020,69,Professional Services,34,"['customer feedback', 'CX', 'business goals']",1,B2C,65.0,legitary,1,Between €1 million-€ 2.5 million,22.0,1,45,1.0,61.0,5,34,funding,111.0
2,Legitary,1,https://www.eu-startups.com/directory/legitary/,1,legitary.com,1,Graz,19,Styria,23.0,2021,41,Health,33,"['digital twins', 'reality capture', 'laser sc...",1,platform,12.0,store-room-gmbh,1,Between €100K-€500K,18.0,2,35,2.0,35.0,3,33,,
3,Lunixo,1,https://www.eu-startups.com/directory/lunixo/,1,lunixo.com,1,Innsbruck,6,Lower Austria,16.0,2022,22,FinTech/InsurTech,15,"['AI', 'royalties', 'music']",1,B2B2C,11.0,getstaymate,1,Between €2.5 million-5 million,12.0,3,28,3.0,25.0,1,32,,
4,MADiscover,1,https://www.eu-startups.com/directory/madiscover/,1,madiscover.com,1,Wien,4,Tyrol,12.0,2017,19,Media & Entertainment,11,"['digital signage', 'multimedia player', 'cont...",1,marketplace,5.0,sproof,1,Between €5 million-€10 million,9.0,5,21,4.0,12.0,2,27,,
5,Novasign,1,https://www.eu-startups.com/directory/novasign/,1,novasign.at,1,Salzburg,4,Salzburg,7.0,2018,18,Hardware,8,"['M&A', 'AI', 'analytics']",1,other,1.0,sprad-io,1,Between €500K-€ 1 million,6.0,4,20,5.0,8.0,4,16,,
6,OTEREA,1,https://www.eu-startups.com/directory/oterea/,1,oterea.com,1,vienna,3,Carinthia,6.0,2023,12,Mobility,8,"['machine learning', 'biotech', 'bioprocess de...",1,,,skinscreener,1,Between €1-€100K,4.0,7,18,6.0,5.0,6,10,,
7,OuMaiGou,1,https://www.eu-startups.com/directory/oumaigou/,1,oumaigou.eu,1,Klagenfurt,3,Vorarlberg,4.0,2015,8,AgTech/FoodTech,6,"['real estate', 'advisory', 'sustainability']",1,,,seamox,1,Between €10 million-€25 million,2.0,6,16,7.0,1.0,7,4,,
8,Read-Coop,1,https://www.eu-startups.com/directory/read-coop/,1,readcoop.eu,1,Spittal an der Drau,2,Burgenland,1.0,2016,8,Energy,5,"['European products', 'Chinese consumers', 'E-...",1,,,runple,1,Above €25 million,1.0,8,15,,,8,1,,
9,SUPPLiot,1,https://www.eu-startups.com/directory/suppliot/,1,suppliot.eu,1,Niederosterreich,2,,,2024,5,ConstructionTech/Green Building,4,"['historical documents', 'handwritten text rec...",1,,,qapture,1,,,9,11,,,10,1,,


In [21]:
import pandas as pd

# 1. Define your desired orders
region_order = [
    'Vienna',
    'Lower Austria',
    'Upper Austria',
    'Styria',
    'Carinthia',
    'Salzburg',
    'Tyrol',
    'Vorarlberg'
]
status_order = ['funding', 'no_funding']

# 2. Convert the columns to ordered Categoricals
df_final['region'] = pd.Categorical(
    df_final['region'],
    categories=region_order,
    ordered=True
)
df_final['status'] = pd.Categorical(
    df_final['status'],
    categories=status_order,
    ordered=True
)

# 3. Sort by region first, then by status
df_sorted = df_final.sort_values(
    by=['region', 'status']
).reset_index(drop=True)



In [23]:
# define the list of columns you want to keep
cols_to_keep = [
    'name',
    'link_website',
    'city',
    'region',
    'founded_year',
    'category',
    'top_3_keywords',
    'business_model',
    'linkedin_handle',
    'status'
]

# select only those columns (and make a copy if you plan to modify it)
df_sorted = df_sorted[cols_to_keep].copy()


In [24]:
df_sorted

Unnamed: 0,name,link_website,city,region,founded_year,category,top_3_keywords,business_model,linkedin_handle,status
0,DAIKI,dai.ki,Vienna,Vienna,2023,Software & Analytics,"['AI', 'compliance', 'governance']",B2B,daiki-ai,funding
1,infrared.city,infrared.city,Vienna,Vienna,2023,Software & Analytics,"['intelligent', 'resilient', 'design']",platform,infrared-city,funding
2,refinq,refinq.com,Vienna,Vienna,2023,Software & Analytics,"['Biodiversity', 'Ecosystems', 'Climate Risk']",B2B,refinq,funding
3,blue auditor,blueauditor.com,Vienna,Vienna,2017,Software & Analytics,"['sustainability', 'real estate', 'management ...",B2B,blue-auditor,funding
4,Necture,necture.com,Vienna,Vienna,2015,Professional Services,"['fleets', 'sustainability', 'solutions']",B2B,necture,funding
5,SchuBu Systems,schubu.systems,Vienna,Vienna,2020,e-commerce,"['interactive textbook', 'digital lessons', 't...",B2B,schubuwien,funding
6,Boolee,boolee.io,Vienna,Vienna,2023,Software & Analytics,"['data analysis', 'SaaS', 'business analysts']",B2B,boolee,funding
7,Glasskube,glasskube.eu,Vienna,Vienna,2023,Software & Analytics,"['infrastructure', 'automation', 'open source']",B2B,glasskube,funding
8,silana,wearesilana.com,Vienna,Vienna,2022,Hardware,"['automated', 'fashion', 'production']",B2B,silana,funding
9,HeldYn,heldyn.com,Vienna,Vienna,2022,Health,"['care', 'therapy', 'healthcare']",B2C,heldyn,funding


In [25]:
df_sorted.to_csv("./data/startups_lumos_sales.csv", index=False)