# Directory Crawling


In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import pandas as pd
import os
import re


SECTOR = "wine" # Sector label; only used to name the output file (the search query itself lives in START_URL)
OUT_DIR = "outputs" # Folder to save output files
os.makedirs(OUT_DIR, exist_ok=True)  # create the folder up front; no error if it already exists

START_URL = "https://www.europages.co.uk/en/search?q=Wine+producers"  # first results page of the search
MAX_PAGES = 5 # Number of pages to crawl

# Setup headless Chrome (browser runs without opening a window)
options = Options()
options.add_argument("--headless")
# NOTE(review): the next two flags are commonly needed when Chrome runs in
# containers/CI; presumably that is why they are set here - confirm.
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
# Module-level browser shared by crawl_directory(); closed in the __main__ block.
driver = webdriver.Chrome(options=options)

def extract_company_links(html):
    """Return the set of company-profile URLs found in ``html``.

    Every ``<a>`` carrying an ``href`` is inspected. Hrefs starting with ``/``
    are prefixed with the Europages origin, and a URL is kept only when it
    matches one of the two profile URL layouts below (case-insensitive).
    """
    origin = "https://www.europages.co.uk"
    profile_patterns = (
        # Old layout, ending in .html
        re.compile(
            r"^https://www\.europages\.co\.uk/[^/]+/[A-Z0-9]+-\d+\.html$", re.IGNORECASE
        ),
        # New layout, without .html
        re.compile(
            r"^https://www\.europages\.co\.uk/en/company/[a-z0-9-]+-\d+$", re.IGNORECASE
        ),
    )

    def absolutise(raw_href):
        # Turn a site-relative link into a full URL; leave anything else as is.
        cleaned = raw_href.strip()
        return origin + cleaned if cleaned.startswith("/") else cleaned

    anchors = BeautifulSoup(html, "lxml").find_all("a", href=True)
    candidates = (absolutise(anchor["href"]) for anchor in anchors)
    return {
        candidate
        for candidate in candidates
        if any(pattern.match(candidate) for pattern in profile_patterns)
    }


def _page_url(start_url, page):
    """Return the URL of results page ``page`` for the search in ``start_url``.

    Page 1 is ``start_url`` itself. Later pages insert ``/page/<n>`` after the
    search path and keep the original query string, e.g.
    ``.../en/search?q=x`` -> ``.../en/search/page/2?q=x``.
    Assumes ``start_url`` is a first-page search URL (not already a
    ``/page/<n>`` URL), as in the pagination scheme this script was written for.
    """
    # Local import: this cell does not import urllib at the top of the file.
    from urllib.parse import urlsplit

    if page <= 1:
        return start_url
    parts = urlsplit(start_url)
    url = f"{parts.scheme}://{parts.netloc}{parts.path.rstrip('/')}/page/{page}"
    return f"{url}?{parts.query}" if parts.query else url


def crawl_directory(start_url, max_pages):
    """Crawl directory pages and collect all company profile links.

    Args:
        start_url: URL of the first search-results page.
        max_pages: Number of result pages to visit; values < 1 visit nothing.

    Returns:
        A sorted list of the unique profile URLs found across all pages.

    Uses the module-level ``driver`` to load each page and
    ``extract_company_links`` to parse it.
    """
    all_links = set()

    for page in range(1, max_pages + 1):
        # Derive every page URL from start_url so the crawl follows the
        # caller's search instead of a hard-coded query.
        page_url = _page_url(start_url, page)
        print(f"[crawl] Page {page}: {page_url}")
        driver.get(page_url)
        time.sleep(3)  # Wait for JavaScript content to load
        links = extract_company_links(driver.page_source)
        print(f"  found {len(links)} links")
        all_links.update(links)

    print(f"[done] Collected {len(all_links)} unique links")
    return sorted(all_links)

def export_links_csv(links, sector):
    """Write ``links`` as a one-column (``url``) CSV named ``links_<sector>.csv`` in OUT_DIR."""
    filename = f"links_{sector}.csv"
    path = os.path.join(OUT_DIR, filename)
    frame = pd.DataFrame({"url": links})
    frame.to_csv(path, index=False)  # index=False: no row-number column in the file
    print(f"[save] {path}")

if __name__ == "__main__":
    try:
        # Crawl MAX_PAGES result pages starting at START_URL and collect profile links
        links = crawl_directory(START_URL, MAX_PAGES)
        
        # Save links to CSV (links_<SECTOR>.csv inside OUT_DIR)
        export_links_csv(links, SECTOR)
    finally:
        # Always close the browser, even when crawling or saving raised
        driver.quit()


[crawl] Page 1: https://www.europages.co.uk/en/search?q=Wine+producers
  found 30 links
[crawl] Page 2: https://www.europages.co.uk/en/search/page/2?q=Wine+producers
  found 30 links
[crawl] Page 3: https://www.europages.co.uk/en/search/page/3?q=Wine+producers
  found 30 links
[crawl] Page 4: https://www.europages.co.uk/en/search/page/4?q=Wine+producers
  found 30 links
[crawl] Page 5: https://www.europages.co.uk/en/search/page/5?q=Wine+producers
  found 30 links
[done] Collected 150 unique links
[save] outputs\links_wine.csv


# Scrape company profile: name, country, and emails.

In [7]:
import pandas as pd
import re
import time
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Setup
SECTOR = "wine" # Sector label; selects the links_<SECTOR>.csv input and names the emails_<SECTOR>.csv output
OUT_DIR = "outputs" # Folder to save results
LINKS_CSV = os.path.join(OUT_DIR, f"links_{SECTOR}.csv") # Input file containing company profile links
EMAILS_CSV = os.path.join(OUT_DIR, f"emails_{SECTOR}.csv") # Output file to save emails

# Setup Chrome WebDriver in headless mode (no browser window)
options = Options()
options.add_argument("--headless")
# NOTE(review): the next two flags are commonly needed when Chrome runs in
# containers/CI; presumably that is why they are set here - confirm.
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
# Module-level browser shared by scrape_profile() and get_email_from_page().
driver = webdriver.Chrome(options=options)

def extract_company_info(soup):
    """Return ``(company_name, country)`` read from a parsed profile page.

    Either value is the placeholder ``"N/A"`` when its element is not found.
    """
    missing = "N/A"

    # Company name: text of the first <a> with class 'company-name'
    name_anchor = soup.find("a", class_="company-name")
    if name_anchor:
        company_name = name_anchor.text.strip()
    else:
        company_name = missing

    # Country: second <span> inside the div with class 'flex gap-1 items-center mt-0.5'
    location_box = soup.find("div", class_="flex gap-1 items-center mt-0.5")
    if not location_box:
        return company_name, missing

    span_tags = location_box.find_all("span")
    if len(span_tags) <= 1:
        return company_name, missing

    return company_name, span_tags[1].text.strip()

def get_email_from_page(url):
    """Load ``url`` in the shared driver and return the email addresses in its text.

    Generic mailboxes (``info@``, ``contact@``, ``sales@``, ``admin@``) are
    dropped when at least one other address is present; if only generic
    addresses are found, they are returned as a fallback.

    Args:
        url: Page to load.

    Returns:
        A list of unique addresses in the order they first appear in the page
        text, or an empty list when none are found. The stable order keeps the
        joined "Email" string identical for identical pages, which a caller
        that de-duplicates on that string needs.
    """
    driver.get(url)
    time.sleep(2)  # wait for page load
    soup = BeautifulSoup(driver.page_source, "lxml")
    text = soup.get_text(" ", strip=True)

    # Find emails using regex
    emails = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)

    # Filter out generic emails
    generic_prefixes = ("info@", "contact@", "sales@", "admin@")
    filtered = [e for e in emails if not e.lower().startswith(generic_prefixes)]

    # dict.fromkeys de-duplicates while preserving first-seen order;
    # set iteration order is not stable between runs.
    return list(dict.fromkeys(filtered if filtered else emails))

def find_contact_or_impressum_link(soup):
    """Return the href of the first link that looks like a contact/impressum page.

    A link qualifies when its visible text or its href contains one of the
    keywords below (compared in lowercase). The href is returned exactly as
    written in the page; ``None`` is returned when no link qualifies.
    """
    keywords = ("contact", "kontakt", "contatti", "scrivici", "impressum")

    def mentions_keyword(value):
        return any(word in value for word in keywords)

    for anchor in soup.find_all("a", href=True):
        target = anchor["href"]
        label = anchor.get_text(strip=True).lower()
        if mentions_keyword(label) or mentions_keyword(target.lower()):
            return target
    return None

def scrape_profile(url):
    """Scrape one company profile and return its name, country, and emails.

    Loads the profile at ``url``, follows its "Visit website" link to the
    company's own site, and looks for email addresses on a contact/impressum
    page there (falling back to the site's landing page).

    Returns a dict with keys ``Name``, ``Country``, ``Email`` (addresses joined
    with "; "), or ``None`` when the website link or the emails are missing, or
    when any step raises.
    """
    print(f"Scraping: {url}")
    try:
        # Step 1: profile page -> company name and country
        driver.get(url)
        time.sleep(3)
        profile_soup = BeautifulSoup(driver.page_source, "lxml")
        company_name, country = extract_company_info(profile_soup)

        # Step 2: href of the first anchor whose text begins with "visit website"
        website_link = next(
            (
                anchor["href"]
                for anchor in profile_soup.find_all("a", href=True)
                if anchor.text.strip().lower().startswith("visit website")
            ),
            None,
        )
        if not website_link:
            print(f"Visit website link not found for {company_name}")
            return None

        # Step 3: open the company's own website
        website_url = urljoin(url, website_link)
        driver.get(website_url)
        time.sleep(3)
        site_soup = BeautifulSoup(driver.page_source, "lxml")

        # Step 4: prefer a contact/impressum page, otherwise use the landing page
        contact_link = find_contact_or_impressum_link(site_soup)
        contact_url = urljoin(website_url, contact_link) if contact_link else website_url
        emails = get_email_from_page(contact_url)

        # Step 5: nothing found yet -> try the first link whose href mentions "impressum"
        if not emails:
            impressum_href = next(
                (
                    anchor["href"]
                    for anchor in site_soup.find_all("a", href=True)
                    if "impressum" in anchor["href"].lower()
                ),
                None,
            )
            if impressum_href is not None:
                emails = get_email_from_page(urljoin(website_url, impressum_href))

        if not (company_name and emails):
            print(f"Incomplete data for {company_name} at {url}")
            return None
        return {"Name": company_name, "Country": country, "Email": "; ".join(emails)}

    except Exception as e:
        # Per-profile boundary: report the failure and let the caller continue.
        print(f"Error scraping {url}: {e}")
        return None

def main():
    """Read company links, scrape each profile, and save the results.

    Reads the ``url`` column of LINKS_CSV, calls ``scrape_profile`` on every
    distinct URL, and writes the successful records to EMAILS_CSV with rows
    de-duplicated on the ``Email`` column. When nothing could be scraped, a
    header-only CSV is written.
    """
    # Read company profile links from CSV
    df_links = pd.read_csv(LINKS_CSV)
    profile_urls = df_links["url"].drop_duplicates().tolist()

    results = []
    for url in profile_urls:
        data = scrape_profile(url)
        if data:
            results.append(data)

    # Explicit columns keep the "Email" column present even when results is
    # empty; without them drop_duplicates(subset=["Email"]) raises KeyError.
    df = pd.DataFrame(results, columns=["Name", "Country", "Email"])
    df = df.drop_duplicates(subset=["Email"])  # avoid duplicates by email

    os.makedirs(OUT_DIR, exist_ok=True)
    df.to_csv(EMAILS_CSV, index=False)
    print(f"Saved output to {EMAILS_CSV}")

if __name__ == "__main__":
    try:
        main()
    finally:
        # Close the browser even if scraping raised, so no headless Chrome
        # process is left running.
        driver.quit()


Scraping: https://www.europages.co.uk/ACRI-WINE-SERVICE/00000005388822-707741001.html
Incomplete data for ACRI WINE SERVICE at https://www.europages.co.uk/ACRI-WINE-SERVICE/00000005388822-707741001.html
Scraping: https://www.europages.co.uk/ADEGA-COOPERATIVA-DE-ALMEIRIM-CRL/PRT013955-00101.html
Scraping: https://www.europages.co.uk/ADEGA-COOPERATIVA-DE-PONTE-DE-BARCA-CRL/00000004477533-311265001.html
Incomplete data for ADEGA COOPERATIVA DE PONTE DE BARCA, CRL at https://www.europages.co.uk/ADEGA-COOPERATIVA-DE-PONTE-DE-BARCA-CRL/00000004477533-311265001.html
Scraping: https://www.europages.co.uk/AG-VINS/00000005521534-001.html
Scraping: https://www.europages.co.uk/AMORIM-CORK/PRT004475-00101.html
Scraping: https://www.europages.co.uk/ARTEBOTTI-DI-MAURIZIO-CANNATA/00000005401987-733065001.html
Scraping: https://www.europages.co.uk/BARRISPT/00000005420863-756922001.html
Scraping: https://www.europages.co.uk/BASTIDE-DE-FAVE/00000005451877-794868001.html
Scraping: https://www.europages.co