In [2]:
import requests
from bs4 import BeautifulSoup
import time
import random
import pandas as pd

import re

## Helper Functions

In [None]:
# Function to clean unusual line terminators
def clean_unusual_terminators(text):
    """Replace Unicode line/paragraph separators in a string with spaces.

    Parameters
    ----------
    text : object
        Value to clean. Only ``str`` instances are modified.

    Returns
    -------
    object
        For strings, a copy in which every U+2028 (LINE SEPARATOR) and
        U+2029 (PARAGRAPH SEPARATOR) is replaced by a single space.
        Any non-string value is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Code point -> replacement; both separators collapse to one space each.
    separator_map = {0x2028: ' ', 0x2029: ' '}
    return text.translate(separator_map)

## EU-Startups Directory Scraping

In [2]:
def extract_listings_from_page(url, headers=None, timeout=30):
    """Download one directory page and return its listings as dictionaries.

    Parameters
    ----------
    url : str
        Address of the directory page to fetch.
    headers : dict, optional
        HTTP headers forwarded to ``requests.get``.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        requests applies no timeout unless one is given, so a stalled
        connection would otherwise block the whole scrape.

    Returns
    -------
    list of dict
        One dict per ``<div>`` whose ``id`` starts with ``wpbdp-listing-``.
        Each dict has the keys ``name``, ``link``, ``category``,
        ``based_in``, ``tags`` and ``founded``; a key stays ``None`` when
        the corresponding element is not found in the listing.

    Raises
    ------
    requests.HTTPError
        If the response status signals an error (``raise_for_status``).
        Other exceptions raised by ``requests.get`` (including timeouts)
        propagate unchanged.
    """
    # Normalized on-page label -> key in the output dict. Labels not listed
    # here (for example "business name", already taken from the title block)
    # are ignored.
    label_to_key = {
        'category': 'category',
        'based in': 'based_in',
        'tags': 'tags',
        'founded': 'founded',
    }
    output_keys = ('name', 'link', 'category', 'based_in', 'tags', 'founded')

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()  # Catch any HTTP errors

    soup = BeautifulSoup(response.content, 'html.parser')

    # Identify all listing containers by matching their id pattern.
    listings = soup.find_all('div', id=lambda value: value and value.startswith("wpbdp-listing-"))
    data = []

    for listing in listings:
        listing_data = dict.fromkeys(output_keys)

        # 1. Name and link come from the first anchor in the listing-title block.
        title_div = listing.find('div', class_='listing-title')
        a_tag = title_div.find('a') if title_div is not None else None
        if a_tag is not None:
            listing_data['name'] = a_tag.get_text(strip=True)
            listing_data['link'] = a_tag.get('href')

        # 2. Remaining fields come from the listing-details block, where each
        #    field is a div whose class includes "wpbdp-field-display".
        details_div = listing.find('div', class_='listing-details')
        if details_div is not None:
            field_divs = details_div.find_all('div', class_=lambda x: x and "wpbdp-field-display" in x)
        else:
            field_divs = []

        for field in field_divs:
            label_span = field.find('span', class_='field-label')
            if label_span is None:
                continue
            # Normalize the label for matching: trim, drop trailing colon, lowercase.
            label = label_span.get_text(strip=True).rstrip(':').lower()
            key = label_to_key.get(label)
            if key is None:
                continue

            value_div = field.find('div', class_='value')
            listing_data[key] = value_div.get_text(strip=True) if value_div is not None else None

        data.append(listing_data)

    return data

In [3]:
# Main scraping loop for multiple pages
base_url = "https://www.eu-startups.com/directory/wpbdp_category/austrian-startups/"
# Number of result pages in this category when the notebook was written;
# adjust if the directory has grown or shrunk since.
last_page = 66

all_listings = []
failed_urls = []  # pages whose scrape raised, kept so they can be retried later

# Send a descriptive custom User-Agent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; MyScraper/1.0; +http://example.com/contact)'
}

for page in range(1, last_page + 1):
    # Page 1 lives at the base URL; later pages append "page/<n>/".
    url = base_url if page == 1 else f"{base_url}page/{page}/"

    print(f"Scraping: {url}")

    try:
        page_listings = extract_listings_from_page(url, headers=headers)
        all_listings.extend(page_listings)
    except Exception as e:
        # Best effort: report the failure, remember the URL, keep going.
        print(f"Error scraping {url}: {e}")
        failed_urls.append(url)

    # Rate limiting: wait 2-5 seconds before the next request.
    # There is no next request after the final page, so skip the wait there.
    if page < last_page:
        delay = random.uniform(2, 5)
        print(f"Sleeping for {delay:.2f} seconds...")
        time.sleep(delay)

# At this point, all_listings contains the data from every page that succeeded.
print("Total listings extracted:", len(all_listings))
if failed_urls:
    print("Pages that failed:", failed_urls)
# Show only a short preview; printing every record floods the cell output.
print(all_listings[:5])

Scraping: https://www.eu-startups.com/directory/wpbdp_category/austrian-startups/
Sleeping for 3.36 seconds...
Scraping: https://www.eu-startups.com/directory/wpbdp_category/austrian-startups/page/2/
Sleeping for 3.22 seconds...
Scraping: https://www.eu-startups.com/directory/wpbdp_category/austrian-startups/page/3/
Sleeping for 4.44 seconds...
Scraping: https://www.eu-startups.com/directory/wpbdp_category/austrian-startups/page/4/
Sleeping for 2.40 seconds...
Scraping: https://www.eu-startups.com/directory/wpbdp_category/austrian-startups/page/5/
Sleeping for 3.83 seconds...
Scraping: https://www.eu-startups.com/directory/wpbdp_category/austrian-startups/page/6/
Sleeping for 3.50 seconds...
Scraping: https://www.eu-startups.com/directory/wpbdp_category/austrian-startups/page/7/
Sleeping for 4.58 seconds...
Scraping: https://www.eu-startups.com/directory/wpbdp_category/austrian-startups/page/8/
Sleeping for 4.84 seconds...
Scraping: https://www.eu-startups.com/directory/wpbdp_category/

In [None]:
# Build the listings table from the scraped records and de-duplicate it:
# first drop rows that are identical in every column, then keep only the
# first row for each listing link, and finally renumber the index.
df = (
    pd.DataFrame(all_listings)
    .drop_duplicates(keep='first')
    .drop_duplicates(subset=['link'], keep='first')
    .reset_index(drop=True)
)

# Clean every cell. DataFrame.applymap is deprecated in recent pandas
# releases, so apply the cleaner column by column with Series.map, which
# is element-wise as well and available in all pandas versions.
df = df.apply(lambda column: column.map(clean_unusual_terminators))

#df.to_csv("data/eustartup_listings.csv", index=False)

#df_all_listings = pd.read_csv("data/eustartup_listings.csv")

# Work on a copy so later cells do not modify df.
df_all_listings = df.copy()

## EU-Startups Individual Listing Scraping

In [5]:
def extract_listing_details(url, headers=None, timeout=30):
    """Fetch a single listing page and extract its detail fields.

    Parameters
    ----------
    url : str
        Address of the individual listing page.
    headers : dict, optional
        HTTP headers forwarded to ``requests.get``.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        requests applies no timeout unless one is given, so a stalled
        connection would otherwise block the whole scrape.

    Returns
    -------
    dict
        Keys ``business_name``, ``logo_link``, ``long_business_description``,
        ``business_description``, ``total_funding``, ``website`` and
        ``company_status`` (each ``None`` when not found) plus
        ``social_links`` (a list of URLs, possibly empty).

    Notes
    -----
    This function does not raise on fetch problems: if the request fails or
    returns an error status, the error is printed and the dict of default
    values is returned.
    """
    details = {
        'business_name': None,
        'logo_link': None,
        'long_business_description': None,
        'business_description': None,
        'total_funding': None,
        'website': None,
        'company_status': None,
        'social_links': []  # using a list to store all social URLs
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        # Deliberate best effort: one bad page must not stop the batch.
        print(f"Error fetching {url}: {e}")
        return details

    soup = BeautifulSoup(response.content, 'html.parser')

    def extract_field(field_identifier):
        """Return the text of the 'value' div inside the first div whose
        class contains ``field_identifier``, or None if either is missing."""
        field_div = soup.find('div', class_=lambda x: x and field_identifier in x)
        if field_div is None:
            return None
        value_div = field_div.find('div', class_='value')
        if value_div is None:
            return None
        return value_div.get_text(" ", strip=True)

    # 1. Business name: try two possible locations.
    # Option A: the <span> inside the page-header <h1>, if available.
    header = soup.find('div', class_='td-page-header')
    if header is not None:
        h1 = header.find('h1', class_='entry-title td-page-title')
        span = h1.find('span') if h1 is not None else None
        if span is not None:
            details['business_name'] = span.get_text(strip=True)
    # Option B: fall back to the <h2> in the listing-title block when
    # option A produced nothing (missing or empty text).
    if not details['business_name']:
        listing_title = soup.find('div', class_='listing-title')
        h2 = listing_title.find('h2') if listing_title is not None else None
        if h2 is not None:
            details['business_name'] = h2.get_text(strip=True)

    # 2. Logo link: the src of the <img> inside the listing-thumbnail anchor.
    thumbnail_div = soup.find('div', class_='listing-thumbnail')
    a_tag = thumbnail_div.find('a') if thumbnail_div is not None else None
    img_tag = a_tag.find('img') if a_tag is not None else None
    if img_tag is not None and img_tag.get('src'):
        details['logo_link'] = img_tag.get('src')

    # 3-7. Text fields: each output key is looked up through the class
    # fragment "wpbdp-field-<key>".
    text_fields = (
        'long_business_description',
        'business_description',
        'total_funding',
        'website',
        'company_status',
    )
    for key in text_fields:
        details[key] = extract_field(f'wpbdp-field-{key}')

    # 8. Social links: every non-empty href inside the social-fields container.
    social_container = soup.find('div', class_='social-fields')
    if social_container is not None:
        for a in social_container.find_all('a'):
            href = a.get('href')
            if href:
                details['social_links'].append(href)

    return details

In [6]:

# Prepare an empty list to hold detailed rows
detailed_rows = []

# Custom headers for your HTTP requests
headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; MyScraper/1.0; +http://example.com/contact)'
}

total_rows = len(df_all_listings)

# Iterate over each row in the DataFrame, tracking the position so the
# pause can be skipped once the last listing has been processed.
for position, (idx, row) in enumerate(df_all_listings.iterrows(), start=1):
    listing_url = row['link']
    print(f"Scraping details from: {listing_url}")

    # Extract additional details from the listing page
    details = extract_listing_details(listing_url, headers=headers)

    # Merge the extracted details with the existing row: start from the
    # row's own values and let the detail fields add to / override them.
    combined_row = row.to_dict()
    combined_row.update(details)
    detailed_rows.append(combined_row)

    # Rate limiting: wait for a random interval between 2 and 3 seconds.
    # Nothing follows the final row, so no wait is needed there.
    if position < total_rows:
        delay = random.uniform(2, 3)
        print(f"Sleeping for {delay:.2f} seconds...")
        time.sleep(delay)

# Convert the list of detailed rows back into a DataFrame
df_detailed = pd.DataFrame(detailed_rows)

Scraping details from: https://www.eu-startups.com/directory/avdain/
Sleeping for 2.14 seconds...
Scraping details from: https://www.eu-startups.com/directory/pdf-to-brainrot/
Error fetching https://www.eu-startups.com/directory/pdf-to-brainrot/: 404 Client Error: Not Found for url: https://www.eu-startups.com/directory/pdf-to-brainrot/
Sleeping for 2.86 seconds...
Scraping details from: https://www.eu-startups.com/directory/softgen/
Sleeping for 2.54 seconds...
Scraping details from: https://www.eu-startups.com/directory/popper-power-gmbh/
Sleeping for 2.11 seconds...
Scraping details from: https://www.eu-startups.com/directory/setter-ai/
Sleeping for 2.98 seconds...
Scraping details from: https://www.eu-startups.com/directory/surveysensum/
Sleeping for 2.21 seconds...
Scraping details from: https://www.eu-startups.com/directory/artypa/
Sleeping for 2.76 seconds...
Scraping details from: https://www.eu-startups.com/directory/share-your-party/
Sleeping for 2.01 seconds...
Scraping deta

In [None]:
df_dl = pd.DataFrame(detailed_rows)

# Lists are unhashable, which duplicate detection cannot handle, so turn
# each list of social links into a tuple before de-duplicating.
df_dl['social_links'] = df_dl['social_links'].apply(lambda x: tuple(x) if isinstance(x, list) else x)

# Drop fully identical rows, then keep the first row per listing link,
# and renumber the index.
df_dl = (
    df_dl
    .drop_duplicates(keep='first')
    .drop_duplicates(subset=['link'], keep='first')
    .reset_index(drop=True)
)

# Clean every cell. DataFrame.applymap is deprecated in recent pandas
# releases, so apply the cleaner column by column with Series.map, which
# is element-wise as well and available in all pandas versions.
df_dl = df_dl.apply(lambda column: column.map(clean_unusual_terminators))

#df_dl.to_csv("data/eustartup_listings.csv", index=False)

# NOTE(review): the export above is commented out, so this read depends on
# a CSV saved by an earlier run, and the path is the same one used for the
# commented-out first-stage listings export. The displayed table has an
# "Unnamed: 0" column, which suggests the file on disk was written with its
# index - confirm how the file was produced before relying on it.
df_detailed_listings = pd.read_csv("data/eustartup_listings.csv")

df_detailed_listings

Unnamed: 0,name,link,category,based_in,tags,founded,business_name,logo_link,long_business_description,business_description,total_funding,website,company_status,social_links
0,Avdain,https://www.eu-startups.com/directory/avdain/,Austria,Vienna,"Company, Startup, One Person",2020.0,Avdain,https://www.eu-startups.com/wp-content/uploads...,Avdain is a enterprise that embodies a fusion ...,Avdain is an technology company founded and so...,No funding announced yet,avdain.com,Active,()
1,PDF To Brainrot,https://www.eu-startups.com/directory/pdf-to-b...,Austria,"245 Wo Lung Street, Fanling, North District, H...","AI,Video,Learning",2024.0,,,,,,,,()
2,Softgen,https://www.eu-startups.com/directory/softgen/,Austria,Austria,"AI code assistant, full stack developer",2024.0,Softgen,https://www.eu-startups.com/wp-content/uploads...,"Beyond a starter: Your complete project, ready...",Softgen is your AI Web App Developer. Describe...,No funding announced yet,https://softgen.ai/,Active,()
3,Popper Power GmbH,https://www.eu-startups.com/directory/popper-p...,Austria,Vienna,"EV, Battery, BESS, Charging",2022.0,Popper Power GmbH,https://www.eu-startups.com/wp-content/uploads...,,Popper Power GmbH develops advanced energy sto...,Between €500K-€ 1 million,www.popperpower.com,Active,"('https://www.linkedin.com/company/86313916',)"
4,Setter AI,https://www.eu-startups.com/directory/setter-ai/,Austria,Wien,"AI, AI Agents, Sales & Marketing, AI SaaS, AI ...",2024.0,Setter AI,https://www.eu-startups.com/wp-content/uploads...,Speed matters when you want more sales. That’s...,Easy-to-use AI appointment setter for WhatsApp...,Between €1-€100K,https://www.trysetter.com,Active,()
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
697,Runtastic,https://www.eu-startups.com/directory/runtastic/,Austria,Linz,"Activity Tracker, Fitness app, Healthcare",2009.0,Runtastic,https://www.eu-startups.com/wp-content/uploads...,,runtastic helps you to track your fitness acti...,,http://www.runtastic.com,,()
698,toolani,https://www.eu-startups.com/directory/toolani/,Austria,Vienna,"Online Calls Software, Connectivity, Telecom",2008.0,toolani,https://www.eu-startups.com/wp-content/uploads...,,toolani offers cheap international calling to ...,,https://www.toolani.com,,()
699,Matchoffice Österreich,https://www.eu-startups.com/directory/matchoff...,Austria,Wien,"Office rentals, Business centres",2008.0,Matchoffice Österreich,https://www.eu-startups.com/wp-content/uploads...,Thanks to our many years of experience with of...,MatchOffice is a recognised player for the pla...,No funding announced yet,https://www.matchoffice.at,Active,()
700,Kununu,https://www.eu-startups.com/directory/kununu/,Austria,Vienna,"Anonymous Feedback, Companies Rating, Reviews ...",2007.0,Kununu,https://www.eu-startups.com/wp-content/uploads...,,The Austria based company kununu offers a plat...,,http://www.kununu.com,,()
