#1. Importing Necessary Libraries

# In [1]:
# Import necessary libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup

# requests: to fetch the web page content.
# pandas: to handle and save data in a structured format (Excel).
# BeautifulSoup: to parse HTML and extract specific information from the page.

#2. Scraping Company Data

# In [None]:
def _parse_company_item(item):
    """Build the Name/Description/Stand record for one exhibitor list item.

    Args:
        item: A parsed ``<li>`` element from the exhibitor list.

    Returns:
        dict: Keys "Name", "Description" and "Stand"; each falls back to a
        "No ... available" placeholder when the matching tag is missing.
    """
    name_tag = item.find('h2', class_='m-exhibitors-list__items__item__header__title')
    stand_tag = item.find('div', class_='m-exhibitors-list__items__item__header__meta__stand')
    description_tag = item.find('div', class_='m-exhibitors-list__items__item__body__description')

    return {
        "Name": name_tag.text.strip() if name_tag else "No name available",
        "Description": (
            description_tag.text.strip() if description_tag else "No description available"
        ),
        # The stand text carries a "Stand:" label that is removed from the value.
        "Stand": (
            stand_tag.text.replace("Stand:", "").strip() if stand_tag else "No stand available"
        ),
    }


def scrape_company_data(base_url, timeout=30):
    """
    Scrape company data from every page of a paginated exhibitor list.

    Pages are requested as ``base_url`` with a ``page`` query parameter,
    starting at 1. Scraping stops at the first page that cannot be fetched,
    returns a non-200 status, contains no company items, or is identical to
    the page before it. Whatever was collected up to that point is returned.

    Args:
        base_url (str): The base URL of the website to scrape.
        timeout (float): Seconds to wait for each HTTP request before giving up.

    Returns:
        list: A list of dictionaries containing company details (name, description, and stand).
    """
    all_companies_data = []
    previous_page_data = None
    page_number = 1

    while True:
        try:
            # `params` lets requests build the query string, so a base URL
            # that already has query parameters is extended, not corrupted.
            # Without a timeout a stalled connection would hang indefinitely.
            response = requests.get(base_url, params={"page": page_number}, timeout=timeout)
        except requests.RequestException as exc:
            # Keep the pages already scraped instead of losing them to a traceback.
            print(f"Failed to retrieve page {page_number}: {exc}")
            break

        if response.status_code != 200:
            print(f"Failed to retrieve page {page_number}, status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        list_items = soup.find_all('li', class_='m-exhibitors-list__items__item')

        # An empty page marks the end of the listing
        if not list_items:
            print("No more companies found, reached the last page.")
            break

        page_data = [_parse_company_item(item) for item in list_items]

        # Guard against sites that serve the same page again for out-of-range
        # page numbers, which would otherwise make this loop run forever.
        if page_data == previous_page_data:
            print(f"Page {page_number} repeats the previous page, stopping.")
            break

        all_companies_data.extend(page_data)
        previous_page_data = page_data
        page_number += 1

    return all_companies_data

# Exhibitor listing that the scraper walks page by page
base_url = "https://london.vetshow.com/exhibitor-list"

# Collect every company record from the listing
companies_data = scrape_company_data(base_url=base_url)

#3. Saving the Data to Excel

# In [None]:
def save_to_excel(data, filename):
    """
    Save the scraped data into an Excel file.

    The rows are written with a 1-based index column named 'Index'. When
    ``filename`` is a path, any missing parent directories are created first
    so that a target such as 'Output/data.xlsx' works on a fresh checkout.

    Args:
        data (list): A list of dictionaries containing company details.
        filename (str): The name of the output Excel file.
    """
    import os  # local import keeps this block self-contained

    # Convert the list of dictionaries to a Pandas DataFrame
    df = pd.DataFrame(data)

    # Number the rows from 1 instead of pandas' default 0
    df.index += 1
    df.index.name = 'Index'

    # Only path-like targets have a directory to create; anything else
    # (e.g. an already-open writer object) is passed through untouched.
    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filename))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Save the DataFrame to an Excel file
    df.to_excel(filename, index=True)

# Save the scraped data to an Excel workbook. The path lives in one variable
# so the save call and the confirmation message cannot drift apart.
output_path = 'Output/LondonVetShowData.xlsx'
save_to_excel(companies_data, output_path)

print(f"Data collection complete. Saved to '{output_path}'.")