In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [12]:
def extract_brand_info(response, category_name):
    """Collect brand records from the HTML of a fetched page.

    Args:
        response: Object whose ``text`` attribute holds the page HTML.
        category_name: Key under which the extracted records are returned.

    Returns:
        A one-entry dict ``{category_name: [record, ...]}``. Every record
        carries the keys ``brand_name``, ``company_name``,
        ``legal_representative``, ``registered_capital``,
        ``foundation_date`` and ``brand_intro``. A brand block in which any
        of the expected elements is missing is left out of the list.
    """
    def _clean_text(parent, tag, css_class):
        # find() yields None for an absent element, so the attribute access
        # raises AttributeError; the loop below treats that as "skip block".
        return parent.find(tag, class_=css_class).text.strip()

    def _detail_value(details, row_class):
        # Each detail row keeps its value in a nested <span class="color2">.
        return _clean_text(details.find('div', class_=row_class), 'span', 'color2')

    document = BeautifulSoup(response.text, 'html.parser')
    records = []

    for block in document.find_all('div', class_='branddesc'):
        try:
            # Name, company and intro live inside the description block itself.
            name = _clean_text(block, 'span', 'color4')
            owner = _clean_text(block, 'span', 'c666')
            intro = _clean_text(block, 'div', 'history')

            # The registration details sit in the sibling that follows it.
            details = block.find_next_sibling('div', class_='brandinfo font15')
            representative = _detail_value(details, 'c999 brandfaren')
            capital = _detail_value(details, 'c999 brandziben')
            founded = _detail_value(details, 'c999 brandtime')
        except AttributeError:
            # Missing element or unexpected structure: drop this block.
            continue
        else:
            records.append({
                'brand_name': name,
                'company_name': owner,
                'legal_representative': representative,
                'registered_capital': capital,
                'foundation_date': founded,
                'brand_intro': intro
            })

    return {category_name: records}

In [13]:
def fetch_url_with_retries(url, retries=3, delay=5):
    """Issue a GET request for ``url``, retrying when an attempt fails.

    An attempt fails when the server answers with a status code other than
    200 or when ``requests`` raises a ``RequestException`` (each request uses
    a 10-second timeout). Every failure is reported with ``print``.

    Args:
        url: Address to request.
        retries: Maximum number of attempts to make.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        The response of the first attempt that returned status code 200, or
        ``None`` when all attempts failed (or ``retries`` is not positive).
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                return response
            print(f"Attempt {attempt + 1} failed: Status code {response.status_code}")
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")

        # Pause only when another attempt will follow; sleeping after the
        # final failure would just delay handing None back to the caller.
        if attempt < retries - 1:
            time.sleep(delay)

    return None

In [14]:
def process_urls_from_csv(csv_file, output_excel='brand_info.xlsx'):
    """Scrape every page listed in a CSV and write the results to Excel.

    Args:
        csv_file: CSV source readable by ``pd.read_csv`` that provides a
            ``url`` column and a ``category`` column.
        output_excel: Name of the Excel file handed to ``save_to_excel``.

    Each row's URL is fetched with ``fetch_url_with_retries``; rows whose
    fetch fails are reported with ``print`` and skipped. Records extracted
    by ``extract_brand_info`` are tagged with the row's category and URL
    before all of them are saved together.
    """
    url_table = pd.read_csv(csv_file)
    collected = []

    for _, record in url_table.iterrows():
        page_url, page_category = record['url'], record['category']

        page = fetch_url_with_retries(page_url)
        if not page:
            print(f"Failed to retrieve the webpage at {page_url} after multiple attempts.")
            continue

        # Records come back grouped under the category key.
        extracted = extract_brand_info(page, page_category)
        for entry in extracted[page_category]:
            # Remember where each record came from.
            entry.update(category=page_category, url=page_url)
            collected.append(entry)

    # One write at the end, covering every row that could be fetched.
    save_to_excel(collected, output_excel)

def save_to_excel(data, file_name='brand_info.xlsx'):
    """Write a list of record dicts to an Excel file.

    Args:
        data: Records to store; passed directly to ``pd.DataFrame``.
        file_name: Destination path of the Excel file.

    The index column is not written. A confirmation line naming the file
    is printed once the write has finished.
    """
    # Build the frame and write it in one step, without the index column.
    pd.DataFrame(data).to_excel(file_name, index=False)
    print(f"Data saved to {file_name}")

In [None]:
# Run the whole pipeline: read the 'url' and 'category' columns from
# urls.csv, fetch and parse each listed page, and save the collected
# records to the default output file (brand_info.xlsx).
process_urls_from_csv('urls.csv')
